diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index eb8c15c323..4826a0ab6e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -21,7 +21,7 @@ env: jobs: build-and-push-image: runs-on: self-hosted - timeout-minutes: 240 # wait up to 4 hours + timeout-minutes: 480 # wait up to 8 hours # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read @@ -61,8 +61,8 @@ jobs: with: context: trinity-${{ github.run_id }} push: true - file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile - shm-size: 64g + file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile.uv + shm-size: 128g tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml index b6756f659a..746c1c9aa8 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/.github/workflows/docker/docker-compose.yaml @@ -1,8 +1,8 @@ services: trinity-node-1: - image: trinity-rft-unittest:20251030 + image: trinity-rft-unittest:20251225 pull_policy: never - command: sh -c "pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" + command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" environment: - HF_ENDPOINT=https://hf-mirror.com - RAY_ADDRESS=auto @@ -13,6 +13,7 @@ services: - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - TRINITY_API_MODEL_PATH=/mnt/models/Qwen3-1.7B - TRINITY_VLM_MODEL_PATH=/mnt/models/Qwen2.5-VL-3B + - VIRTUAL_ENV=/opt/venv working_dir: /workspace networks: - trinity-network @@ -29,14 +30,15 @@ services: capabilities: [gpu] trinity-node-2: - image: trinity-rft-unittest:20251030 + image: trinity-rft-unittest:20251225 pull_policy: never - command: sh -c "pip install -e .[dev] && ray start --address=trinity-node-1:6379 
--block" + command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block" environment: - HF_ENDPOINT=https://hf-mirror.com - TRINITY_CHECKPOINT_ROOT_DIR=/mnt/checkpoints - TRINITY_TASKSET_PATH=/mnt/data - TRINITY_MODEL_PATH=/mnt/models/Qwen3-1.7B + - VIRTUAL_ENV=/opt/venv working_dir: /workspace volumes: - trinity-volume:/mnt diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index af12d99902..42a88f413d 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -9,5 +9,5 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.12' - uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/sphinx-doc.yaml b/.github/workflows/sphinx-doc.yaml index 934d71060b..422d6c9136 100644 --- a/.github/workflows/sphinx-doc.yaml +++ b/.github/workflows/sphinx-doc.yaml @@ -16,10 +16,10 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.10'] + python-version: ['3.12'] env: OS: ${{ matrix.os }} - PYTHON: '3.10' + PYTHON: '3.12' steps: - name: Free up disk space run: | diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 5f1db13eae..89b6429aa6 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -34,7 +34,10 @@ jobs: MAX_RETRIES=20 RETRY_INTERVAL=5 for i in $(seq 1 $MAX_RETRIES); do - docker compose exec trinity-node-1 ray status && docker compose exec trinity-node-2 ray status && break + if docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" \ + && docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then + break + fi echo "Waiting for ray cluster to be ready... 
($i/$MAX_RETRIES)" sleep $RETRY_INTERVAL if [ "$i" -eq "$MAX_RETRIES" ]; then @@ -76,12 +79,12 @@ jobs: TYPE="${{ steps.test_type.outputs.type }}" if [ "$TYPE" = "all" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 pytest tests -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json" elif [ "$TYPE" = "diff" ]; then if [ -s ../../../test_dirs.txt ]; then echo "tests_run=true" >> $GITHUB_ENV TEST_DIRS=$(cat ../../../test_dirs.txt | xargs) - docker compose exec trinity-node-1 pytest $TEST_DIRS -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json" else echo "No changed modules detected, skipping tests." echo "tests_run=false" >> $GITHUB_ENV @@ -90,7 +93,7 @@ jobs: MODULE="${{ steps.test_type.outputs.module }}" if [ -n "$MODULE" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 pytest tests/$MODULE -v -s --ctrf report.json + docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json" else echo "No module specified, skipping tests." 
echo "tests_run=false" >> $GITHUB_ENV diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97342e6637..91a9702b19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: rev: 23.7.0 hooks: - id: black - language_version: python3.10 + language_version: python3.12 args: [--line-length=100] - repo: https://github.com/pycqa/isort diff --git a/README.md b/README.md index 58526defa1..0df3f8fc80 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ Choose one of the following options: ###### Using Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/README_zh.md b/README_zh.md index fd90f4af51..a16063f30d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -182,7 +182,7 @@ cd Trinity-RFT #### 使用 Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/docs/sphinx_doc/source/tutorial/trinity_installation.md b/docs/sphinx_doc/source/tutorial/trinity_installation.md index bd72967556..2e554906d8 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source/tutorial/trinity_installation.md @@ -29,7 +29,7 @@ Choose one of the following options: #### Using Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md index d337e84960..24b4eefbb2 100644 --- a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md @@ -29,7 +29,7 @@ cd Trinity-RFT #### 使用 Conda ```bash -conda create -n trinity python=3.10 +conda create -n trinity python=3.12 conda activate trinity pip install -e ".[dev]" diff --git a/environments/data.yaml 
b/environments/data.yaml index d43ece076b..8371cda714 100644 --- a/environments/data.yaml +++ b/environments/data.yaml @@ -2,7 +2,7 @@ name: trinity_data channels: - defaults dependencies: - - python=3.10 + - python=3.12 - pip: - py-data-juicer - agentscope diff --git a/environments/training.yaml b/environments/training.yaml index 436a75e778..70b255292d 100644 --- a/environments/training.yaml +++ b/environments/training.yaml @@ -2,6 +2,6 @@ name: trinity channels: - defaults dependencies: - - python=3.10 + - python=3.12 - pip: - "-e ..[dev]" diff --git a/pyproject.toml b/pyproject.toml index f7a8162bfe..b7e3227a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "trinity-rft" -version = "0.3.3" +version = "0.4.0" authors = [ {name="Trinity-RFT Team", email="trinity-rft@outlook.com"}, ] @@ -73,6 +73,8 @@ dev = [ ] megatron = [ "megatron-core[mlm]==0.13.1", + # if you hit an "undefined symbol" error in transformer engine, + # reinstall it with the `--no-build-isolation` and `--no-cache-dir` flags "transformer_engine[pytorch]==2.8.0", "mbridge>=0.13.0", ] diff --git a/scripts/docker/Dockerfile.uv b/scripts/docker/Dockerfile.uv index 82d5389ada..75f6fe12ba 100644 --- a/scripts/docker/Dockerfile.uv +++ b/scripts/docker/Dockerfile.uv @@ -39,8 +39,9 @@ RUN . /opt/venv/bin/activate && \ # Install flash_attn and Megatron RUN . 
/opt/venv/bin/activate && \ - uv pip install flash_attn==2.8.1 --no-cache-dir && \ + uv pip install flash_attn==2.8.1 --no-build-isolation && \ uv pip install -e .[megatron] && \ + uv pip install --reinstall transformer_engine[pytorch]==2.8.0 --no-build-isolation --no-cache-dir && \ NVCC_APPEND_FLAGS="--threads 4" APEX_PARALLEL_BUILD=8 \ uv pip install -v --no-build-isolation \ --config-settings="--build-option=--cpp_ext" \ diff --git a/tests/utils/swanlab_test.py b/tests/utils/swanlab_test.py index 6b7f6a9c1e..c90785b42a 100644 --- a/tests/utils/swanlab_test.py +++ b/tests/utils/swanlab_test.py @@ -5,6 +5,9 @@ class TestSwanlabMonitor(unittest.TestCase): @classmethod def setUpClass(cls): + cls._original_env = { + "SWANLAB_API_KEY": os.environ.get("SWANLAB_API_KEY"), + } os.environ["SWANLAB_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxx" @classmethod @@ -31,7 +34,3 @@ def test_swanlab_monitor_smoke(self): # Log a minimal metric to verify basic flow mon.log({"smoke/metric": 1.0}, step=1) mon.close() - - -if __name__ == "__main__": - unittest.main() diff --git a/trinity/__init__.py b/trinity/__init__.py index 3866a3cd5e..26314a1a39 100644 --- a/trinity/__init__.py +++ b/trinity/__init__.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- """Trinity-RFT (Reinforcement Fine-Tuning)""" -__version__ = "0.3.3" +__version__ = "0.4.0" diff --git a/trinity/common/models/vllm_model.py b/trinity/common/models/vllm_model.py index c6f85b48f8..60131f13aa 100644 --- a/trinity/common/models/vllm_model.py +++ b/trinity/common/models/vllm_model.py @@ -447,7 +447,7 @@ async def convert_messages_to_experience( if len(token_ids) > self.config.max_model_len - 1: truncate_status = "response_truncated" self.logger.warning( - f"Warning: {len(token_ids) = } exceeds the length limit {self.config.max_model_len-1 = }" + f"Warning: {len(token_ids)=} exceeds the length limit {(self.config.max_model_len - 1)=}" ) token_ids = token_ids[: self.config.max_model_len - 1] action_mask = action_mask[: 
self.config.max_model_len - 1]