From c4cdcf0f3065b04349c1456bc6a99e9b0737951a Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 08:47:23 +0000 Subject: [PATCH 1/7] update to 0.4.0 --- .github/workflows/docker.yaml | 6 +++--- .github/workflows/docker/docker-compose.yaml | 4 ++-- .github/workflows/pre-commit.yaml | 2 +- .github/workflows/unittest.yaml | 8 ++++---- pyproject.toml | 4 +++- scripts/docker/Dockerfile.uv | 6 ++++-- tests/utils/swanlab_test.py | 7 +++---- trinity/__init__.py | 2 +- trinity/common/models/vllm_model.py | 2 +- 9 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index eb8c15c323..4826a0ab6e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -21,7 +21,7 @@ env: jobs: build-and-push-image: runs-on: self-hosted - timeout-minutes: 240 # wait up to 4 hours + timeout-minutes: 480 # wait up to 8 hours # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read @@ -61,8 +61,8 @@ jobs: with: context: trinity-${{ github.run_id }} push: true - file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile - shm-size: 64g + file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile.uv + shm-size: 128g tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index b6756f659a..08b8f45d8a 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -1,6 +1,6 @@ services: trinity-node-1: - image: trinity-rft-unittest:20251030 + image: trinity-rft-unittest:20251225 pull_policy: never command: sh -c "pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" environment: @@ -29,7 +29,7 @@ services: capabilities: [gpu] trinity-node-2: - image: trinity-rft-unittest:20251030 + image: trinity-rft-unittest:20251225 
pull_policy: never command: sh -c "pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block" environment: diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index af12d99902..42a88f413d 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -9,5 +9,5 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.12' - uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 5f1db13eae..9632233b97 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -34,7 +34,7 @@ jobs: MAX_RETRIES=20 RETRY_INTERVAL=5 for i in $(seq 1 $MAX_RETRIES); do - docker compose exec trinity-node-1 ray status && docker compose exec trinity-node-2 ray status && break + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" && docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status" && break echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)" sleep $RETRY_INTERVAL if [ "$i" -eq "$MAX_RETRIES" ]; then @@ -76,12 +76,12 @@ jobs: TYPE="${{ steps.test_type.outputs.type }}" if [ "$TYPE" = "all" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 pytest tests -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json" elif [ "$TYPE" = "diff" ]; then if [ -s ../../../test_dirs.txt ]; then echo "tests_run=true" >> $GITHUB_ENV TEST_DIRS=$(cat ../../../test_dirs.txt | xargs) - docker compose exec trinity-node-1 pytest $TEST_DIRS -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json" else echo "No changed modules detected, skipping tests." 
echo "tests_run=false" >> $GITHUB_ENV @@ -90,7 +90,7 @@ jobs: MODULE="${{ steps.test_type.outputs.module }}" if [ -n "$MODULE" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 pytest tests/$MODULE -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json" else echo "No module specified, skipping tests." echo "tests_run=false" >> $GITHUB_ENV diff --git a/pyproject.toml b/pyproject.toml index f7a8162bfe..b7e3227a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "trinity-rft" -version = "0.3.3" +version = "0.4.0" authors = [ {name="Trinity-RFT Team", email="trinity-rft@outlook.com"}, ] @@ -73,6 +73,8 @@ dev = [ ] megatron = [ "megatron-core[mlm]==0.13.1", + # If you hit an "undefined symbol" error in transformer_engine, + # reinstall it with the `--no-build-isolation` and `--no-cache-dir` flags. "transformer_engine[pytorch]==2.8.0", "mbridge>=0.13.0", ] diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv index 82d5389ada..552685a423 100644 --- a/scripts/docker/Dockerfile.uv +++ b/scripts/docker/Dockerfile.uv @@ -39,8 +39,10 @@ RUN . /opt/venv/bin/activate && \ # Install flash_attn and Megatron RUN . 
/opt/venv/bin/activate && \ - uv pip install flash_attn==2.8.1 --no-cache-dir && \ - uv pip install -e .[megatron] && \ + uv pip install flash_attn==2.8.1 --no-build-isolation && \ + uv pip install megatron-core[mlm]==0.13.1 && \ + uv pip install transformer_engine[pytorch]==2.8.0 --no-build-isolation && \ + uv pip install mbridge>=0.13.0 && \ NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 \ uv pip install -v --no-build-isolation \ --config-settings="--build-option=--cpp_ext" \ diff --git a/tests/utils/swanlab_test.py b/tests/utils/swanlab_test.py index 6b7f6a9c1e..c90785b42a 100644 --- a/tests/utils/swanlab_test.py +++ b/tests/utils/swanlab_test.py @@ -5,6 +5,9 @@ class TestSwanlabMonitor(unittest.TestCase): @classmethod def setUpClass(cls): + cls._original_env = { + "SWANLAB_API_KEY": os.environ.get("SWANLAB_API_KEY"), + } os.environ["SWANLAB_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxx" @classmethod @@ -31,7 +34,3 @@ def test_swanlab_monitor_smoke(self): # Log a minimal metric to verify basic flow mon.log({"smoke/metric": 1.0}, step=1) mon.close() - - -if __name__ == "__main__": - unittest.main() diff --git a/trinity/__init__.py b/trinity/__init__.py index 3866a3cd5e..26314a1a39 100644 --- a/trinity/__init__.py +++ b/trinity/__init__.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- """Trinity-RFT (Reinforcement Fine-Tuning)""" -__version__ = "0.3.3" +__version__ = "0.4.0" diff --git a/trinity/common/models/vllm_model.py b/trinity/common/models/vllm_model.py index c6f85b48f8..60131f13aa 100644 --- a/trinity/common/models/vllm_model.py +++ b/trinity/common/models/vllm_model.py @@ -447,7 +447,7 @@ async def convert_messages_to_experience( if len(token_ids) > self.config.max_model_len - 1: truncate_status = "response_truncated" self.logger.warning( - f"Warning: {len(token_ids) = } exceeds the length limit {self.config.max_model_len-1 = }" + f"Warning: {len(token_ids)=} exceeds the length limit {(self.config.max_model_len - 1)=}" ) token_ids = token_ids[: 
self.config.max_model_len - 1] action_mask = action_mask[: self.config.max_model_len - 1] From 606658ebc85d38bd4a50ab0d2148d1e98109390a Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 09:06:36 +0000 Subject: [PATCH 2/7] update python version --- .github/workflows/sphinx-doc.yaml | 4 ++-- README.md | 2 +- README_zh.md | 2 +- docs/sphinx_doc/source_zh/tutorial/trinity_installation.md | 2 +- environments/data.yaml | 2 +- environments/training.yaml | 2 +- scripts/docker/Dockerfile.uv | 5 ++--- 7 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/sphinx-doc.yaml b/.github/workflows/sphinx-doc.yaml index 934d71060b..422d6c9136 100644 --- a/.github/workflows/sphinx-doc.yaml +++ b/.github/workflows/sphinx-doc.yaml @@ -16,10 +16,10 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.10'] + python-version: ['3.12'] env: OS: ${{ matrix.os }} - PYTHON: '3.10' + PYTHON: '3.12' steps: - name: Free up disk space run: | diff --git a/README.md b/README.md index 58526defa1..0df3f8fc80 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ Choose one of the following options: ###### Using Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/README_zh.md b/README_zh.md index fd90f4af51..a16063f30d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -182,7 +182,7 @@ cd Trinity-RFT #### 使用 Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md index d337e84960..24b4eefbb2 100644 --- a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md @@ -29,7 +29,7 @@ cd Trinity-RFT #### 使用 Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity 
python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/environments/data.yaml b/environments/data.yaml index d43ece076b..8371cda714 100644 --- a/environments/data.yaml +++ b/environments/data.yaml @@ -2,7 +2,7 @@ name: trinity_data channels: - defaults dependencies: - - python=3.10 + - python=3.12 - pip: - py-data-juicer - agentscope diff --git a/environments/training.yaml b/environments/training.yaml index 436a75e778..70b255292d 100644 --- a/environments/training.yaml +++ b/environments/training.yaml @@ -2,6 +2,6 @@ name: trinity channels: - defaults dependencies: - - python=3.10 + - python=3.12 - pip: - "-e ..[dev]" diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv index 552685a423..75f6fe12ba 100644 --- a/scripts/docker/Dockerfile.uv +++ b/scripts/docker/Dockerfile.uv @@ -40,9 +40,8 @@ RUN . /opt/venv/bin/activate && \ # Install flash_attn and Megatron RUN . /opt/venv/bin/activate && \ uv pip install flash_attn==2.8.1 --no-build-isolation && \ - uv pip install megatron-core[mlm]==0.13.1 && \ - uv pip install transformer_engine[pytorch]==2.8.0 --no-build-isolation && \ - uv pip install mbridge>=0.13.0 && \ + uv pip install -e .[megatron] && \ + uv pip install --reinstall transformer_engine[pytorch]==2.8.0 --no-build-isolation --no-cache-dir && \ NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 \ uv pip install -v --no-build-isolation \ --config-settings="--build-option=--cpp_ext" \ From f45e159f925ddcbf859aa206c055db075272924b Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 09:08:41 +0000 Subject: [PATCH 3/7] update python version --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97342e6637..91a9702b19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: rev: 23.7.0 hooks: - id: black - language_version: python3.10 + language_version: python3.12 args: 
[--line-length=100] - repo: https://github.com/pycqa/isort From 92ee04a41fc8ee4a929d0c64802f1974979cae25 Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 09:10:59 +0000 Subject: [PATCH 4/7] update python version --- docs/sphinx_doc/source/tutorial/trinity_installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx_doc/source/tutorial/trinity_installation.md b/docs/sphinx_doc/source/tutorial/trinity_installation.md index bd72967556..2e554906d8 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source/tutorial/trinity_installation.md @@ -29,7 +29,7 @@ Choose one of the following options: #### Using Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" From 5633610cfc42d69bf261346582f46ccc80c919be Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 10:09:37 +0000 Subject: [PATCH 5/7] update docker compose --- .github/workflows/docker/docker-compose.yaml | 4 ++-- .github/workflows/unittest.yaml | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index 08b8f45d8a..1a931597e8 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -2,7 +2,7 @@ services: trinity-node-1: image: trinity-rft-unittest:20251225 pull_policy: never - command: sh -c "pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" + command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" environment: - HF_ENDPOINT=https://hf-mirror.com - RAY_ADDRESS=auto @@ -31,7 +31,7 @@ services: trinity-node-2: image: trinity-rft-unittest:20251225 pull_policy: never - command: sh -c "pip install -e .[dev] && ray start 
--address=trinity-node-1:6379 --block" + command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block" environment: - HF_ENDPOINT=https://hf-mirror.com - TRINITY_CHECKPOINT_ROOT_DIR=/mnt/checkpoints diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 9632233b97..702145b14e 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -34,7 +34,10 @@ jobs: MAX_RETRIES=20 RETRY_INTERVAL=5 for i in $(seq 1 $MAX_RETRIES); do - docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" && docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status" && break + if docker compose exec trinity-node-test-1 bash -c "source /opt/venv/bin/activate && ray status" \ + && docker compose exec trinity-node-test-2 bash -c "source /opt/venv/bin/activate && ray status"; then + break + fi echo "Waiting for ray cluster to be ready... 
($i/$MAX_RETRIES)" sleep $RETRY_INTERVAL if [ "$i" -eq "$MAX_RETRIES" ]; then From 14440436f62d11b8a2af912f7ebba9b2750e272d Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 10:15:23 +0000 Subject: [PATCH 6/7] update docker compose --- .github/workflows/docker/docker-compose.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index 1a931597e8..746c1c9aa8 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -13,6 +13,7 @@ services: - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - TRINITY_API_MODEL_PATH=/mnt/models/Qwen3-1.7B - TRINITY_VLM_MODEL_PATH=/mnt/models/Qwen2.5-VL-3B + - VIRTUAL_ENV=/opt/venv working_dir: /workspace networks: - trinity-network @@ -37,6 +38,7 @@ services: - TRINITY_CHECKPOINT_ROOT_DIR=/mnt/checkpoints - TRINITY_TASKSET_PATH=/mnt/data - TRINITY_MODEL_PATH=/mnt/models/Qwen3-1.7B + - VIRTUAL_ENV=/opt/venv working_dir: /workspace volumes: - trinity-volume:/mnt From 074eecc4fda37eaa828e1b83088f10f57a507519 Mon Sep 17 00:00:00 2001 From: pan-x-c Date: Thu, 25 Dec 2025 10:52:21 +0000 Subject: [PATCH 7/7] fix unittest --- .github/workflows/unittest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 702145b14e..89b6429aa6 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -34,8 +34,8 @@ jobs: MAX_RETRIES=20 RETRY_INTERVAL=5 for i in $(seq 1 $MAX_RETRIES); do - if docker compose exec trinity-node-test-1 bash -c "source /opt/venv/bin/activate && ray status" \ - && docker compose exec trinity-node-test-2 bash -c "source /opt/venv/bin/activate && ray status"; then + if docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" \ + && docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then break fi 
echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)"