Skip to content

tpch: fix Parquet streaming + align batch size with tpcds #103

tpch: fix Parquet streaming + align batch size with tpcds

tpch: fix Parquet streaming + align batch size with tpcds #103

Workflow file for this run

name: CI - Build & Benchmark
# Runs for pushes and pull requests that target the two long-lived branches,
# and can also be started by hand from the Actions tab.
on:
  push:
    branches: [master, develop]
  pull_request:
    branches: [master, develop]
  workflow_dispatch:
# One concurrency group per pull-request head branch: a newer run in the same
# group cancels the one still in progress. Events without a head_ref fall back
# to the run id for the group name.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
# Resolve the best available Docker image tag for each config.
# Prefers a branch-specific tag (e.g. tsafin-lance_stream) over :latest so
# that PRs with updated lance-ffi/third_party sources automatically use the
# matching pre-compiled image built by the docker-images workflow.
#
# Outputs (consumed by the container.image expressions of later jobs):
#   base_image, orc_image, lance_image - fully qualified image references.
resolve-images:
  name: Resolve Docker image tags
  runs-on: ubuntu-22.04
  outputs:
    base_image: ${{ steps.resolve.outputs.base_image }}
    orc_image: ${{ steps.resolve.outputs.orc_image }}
    lance_image: ${{ steps.resolve.outputs.lance_image }}
  steps:
    - name: Log in to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Resolve image tags
      id: resolve
      # Context values are handed over as environment variables instead of
      # being expanded into the script text: the head branch name of a pull
      # request is chosen by its author, and inline ${{ }} expansion would
      # let shell metacharacters in it run as part of this script.
      env:
        BRANCH: ${{ github.head_ref || github.ref_name }}
        REPO_OWNER: ${{ github.repository_owner }}
      run: |
        # Image references must be lowercase, so normalise the owner name.
        OWNER=$(printf '%s' "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
        IMAGE_PREFIX="ghcr.io/${OWNER}/tpch-cpp"
        # Sanitize branch name the same way docker/metadata-action does:
        # type=ref,event=branch replaces '/' with '-' and lowercases, keeps underscores
        BRANCH_TAG=$(printf '%s' "$BRANCH" | tr '/' '-' | tr '[:upper:]' '[:lower:]')
        echo "Branch: $BRANCH → tag: $BRANCH_TAG"
        for CONFIG in base orc lance; do
          BRANCH_IMAGE="${IMAGE_PREFIX}-${CONFIG}:${BRANCH_TAG}"
          LATEST_IMAGE="${IMAGE_PREFIX}-${CONFIG}:latest"
          # Probe the registry for the branch-specific image; fall back to :latest
          if docker manifest inspect "$BRANCH_IMAGE" > /dev/null 2>&1; then
            echo "Using branch image: $BRANCH_IMAGE"
            echo "${CONFIG}_image=$BRANCH_IMAGE" >> "$GITHUB_OUTPUT"
          else
            echo "Branch image not found, falling back to: $LATEST_IMAGE"
            echo "${CONFIG}_image=$LATEST_IMAGE" >> "$GITHUB_OUTPUT"
          fi
        done
# Builds the project once per dependency flavour (base / orc / lance) inside
# the matching pre-built image, runs the unit tests assigned to that flavour
# and publishes the binaries as the artifact "tpch-benchmark-<config>" for
# the benchmark jobs.
build-matrix:
  name: Build (${{ matrix.config }})
  runs-on: ubuntu-22.04
  needs: resolve-images
  timeout-minutes: 20
  container:
    # "a && b || c" chain used as a ternary: picks the image for this config.
    image: ${{ matrix.config == 'base' && needs.resolve-images.outputs.base_image || matrix.config == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }}
    options: --user root
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # ON/OFF are quoted so that no YAML loader can read them as booleans;
        # they are passed verbatim to CMake and compared as strings below.
        - config: base
          enable_orc: 'OFF'
          enable_lance: 'OFF'
          enable_tests: 'ON'
          deps_path: /opt/dependencies
        - config: orc
          enable_orc: 'ON'
          enable_lance: 'OFF'
          enable_tests: 'ON'
          deps_path: /opt/dependencies
        - config: lance
          enable_orc: 'OFF'
          enable_lance: 'ON'
          enable_tests: 'ON'
          deps_path: /opt/dependencies
  steps:
    - name: Checkout code with submodules
      uses: actions/checkout@v4
      with:
        submodules: recursive
        fetch-depth: 1
    - name: Verify pre-compiled dependencies
      # Informational listing only: every ls is allowed to fail.
      run: |
        echo "=== Checking pre-compiled dependencies in Docker image ==="
        ls -lh /opt/dependencies/lib/libarrow* /opt/dependencies/lib/libparquet* || true
        if [ "${{ matrix.config }}" = "orc" ]; then
          ls -lh /opt/dependencies/lib/liborc* || true
        fi
        if [ "${{ matrix.config }}" = "lance" ]; then
          ls -lh /opt/dependencies/lib/liblance_ffi.a || true
          rustc --version
          cargo --version
        fi
    - name: Configure CMake
      run: |
        cmake -B build \
          -DCMAKE_BUILD_TYPE=RelWithDebInfo \
          -DCMAKE_PREFIX_PATH=${{ matrix.deps_path }} \
          -DTPCH_ENABLE_ORC=${{ matrix.enable_orc }} \
          -DTPCH_ENABLE_LANCE=${{ matrix.enable_lance }} \
          -DTPCH_ENABLE_NATIVE_OPTIMIZATIONS=OFF \
          -DTPCH_ENABLE_ASYNC_IO=ON \
          -DTPCH_ENABLE_ASAN=OFF \
          -DTPCH_BUILD_TESTS=${{ matrix.enable_tests }} \
          -DTPCDS_ENABLE=ON
    - name: Build project
      run: cmake --build build -j$(nproc)
    - name: Verify executable and tests
      run: |
        # Both benchmark binaries are required by the downstream jobs, so a
        # missing one must fail this step. A bare "test -f X && echo ..." does
        # not do that: a failing command on the left of && never triggers the
        # shell's exit-on-error, and the step would still end successfully.
        missing=0
        for exe in tpch_benchmark tpcds_benchmark; do
          if [ -f "build/$exe" ]; then
            echo "✓ $exe created"
          else
            echo "✗ $exe missing"
            missing=1
          fi
        done
        # Test binaries are only reported here; the "Run unit tests" step is
        # the one that fails when an expected test executable is absent.
        test -f build/tests/buffer_lifetime_manager_test && echo "✓ buffer_lifetime_manager_test created" || true
        test -f build/tests/dbgen_batch_iterator_test && echo "✓ dbgen_batch_iterator_test created" || true
        if [ "${{ matrix.enable_lance }}" = "ON" ]; then
          test -f build/tests/lance_writer_test && echo "✓ lance_writer_test created" || echo "✗ lance_writer_test missing"
        fi
        if [ "$missing" -ne 0 ]; then
          echo "ERROR: required benchmark executable(s) missing from build/"
          exit 1
        fi
    - name: Run unit tests
      run: |
        # Define which tests to run for each configuration
        case "${{ matrix.config }}" in
          base)
            # Run common/core tests only in base build
            TESTS="buffer_lifetime_manager_test dbgen_batch_iterator_test"
            ;;
          orc)
            # ORC doesn't have format-specific tests yet, skip to avoid redundancy
            TESTS=""
            echo "No ORC-specific tests to run (common tests run in base config)"
            ;;
          lance)
            # Run Lance-specific tests only
            TESTS="lance_writer_test"
            ;;
          *)
            echo "Unknown config: ${{ matrix.config }}"
            exit 1
            ;;
        esac
        # Run the specified tests from build directory (where dists.dss exists)
        cd build
        for test_name in $TESTS; do
          if [ -x "tests/$test_name" ]; then
            echo "============================"
            echo "Running: $test_name"
            echo "============================"
            "./tests/$test_name" --gtest_output=xml:"tests/${test_name}_results.xml" || exit 1
          else
            echo "ERROR: Expected test executable not found: tests/$test_name"
            exit 1
          fi
        done
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        name: tpch-benchmark-${{ matrix.config }}
        path: |
          build/tpch_benchmark
          build/tpcds_benchmark
          build/tests/*_test
        retention-days: 1
        if-no-files-found: error
# Format coverage for TPC-H: one job per (format, table) pair, each running
# tpch_benchmark at scale factor 1 with the binary built for that format's
# dependency flavour ("build" selects both the image and the artifact).
tpch-benchmark-suite:
  name: TPC-H Benchmark Suite
  runs-on: ubuntu-22.04
  needs: [resolve-images, build-matrix]
  timeout-minutes: 20
  container:
    image: ${{ matrix.build == 'base' && needs.resolve-images.outputs.base_image || matrix.build == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }}
    options: --user root
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # CSV format - all tables
        - {format: csv, table: lineitem, build: base}
        - {format: csv, table: orders, build: base}
        - {format: csv, table: customer, build: base}
        - {format: csv, table: part, build: base}
        - {format: csv, table: partsupp, build: base}
        - {format: csv, table: supplier, build: base}
        - {format: csv, table: nation, build: base}
        - {format: csv, table: region, build: base}
        # Parquet format - all tables
        - {format: parquet, table: lineitem, build: base}
        - {format: parquet, table: orders, build: base}
        - {format: parquet, table: customer, build: base}
        - {format: parquet, table: part, build: base}
        - {format: parquet, table: partsupp, build: base}
        - {format: parquet, table: supplier, build: base}
        - {format: parquet, table: nation, build: base}
        - {format: parquet, table: region, build: base}
        # ORC format - lineitem, customer, orders
        - {format: orc, table: lineitem, build: orc}
        - {format: orc, table: customer, build: orc}
        - {format: orc, table: orders, build: orc}
        # Lance format - lineitem, customer, orders
        - {format: lance, table: lineitem, build: lance}
        - {format: lance, table: customer, build: lance}
        - {format: lance, table: orders, build: lance}
  steps:
    - name: Checkout code (for dists.dss and scripts)
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout tpch submodule (for dists.dss)
      run: |
        git config --global --add safe.directory "$GITHUB_WORKSPACE"
        git submodule update --init --depth 1 -- third_party/tpch
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        name: tpch-benchmark-${{ matrix.build }}
        path: .
    - name: Setup benchmark executable
      run: |
        chmod +x tpch_benchmark
        mkdir -p benchmark-results
        # Only append the previous value when there is one: a trailing ':'
        # is an empty entry, which the loader treats as the current directory.
        echo "LD_LIBRARY_PATH=/opt/dependencies/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
    - name: Verify executable supports required format
      # Informational only: prints the start of the help text, never fails.
      run: |
        echo "=== Verifying build artifact ==="
        echo "Expected format: ${{ matrix.format }}"
        echo "Expected build: ${{ matrix.build }}"
        ./tpch_benchmark --help | head -20 || true
    - name: Run format coverage benchmark
      # bash is requested explicitly because the script reads PIPESTATUS.
      shell: bash
      run: |
        LOG="benchmark-results/${{ matrix.format }}_${{ matrix.table }}_baseline.log"
        # Copy dists.dss to current directory (required by dbgen)
        cp third_party/tpch/dbgen/dists.dss . 2>/dev/null || true
        # The benchmark output is filtered and tee'd into the log, so the
        # pipeline's own status is not the benchmark's. Take the status of
        # the first pipeline member (timeout/benchmark) from PIPESTATUS, with
        # exit-on-error suspended so a failing pipeline does not abort first.
        set +e
        timeout 600 ./tpch_benchmark \
          --use-dbgen \
          --scale-factor 1 \
          --format ${{ matrix.format }} \
          --table ${{ matrix.table }} \
          --output-dir benchmark-results/ \
          2>&1 | grep -v "^DEBUG:" | tee "$LOG"
        status=${PIPESTATUS[0]}
        set -e
        if [ "$status" -ne 0 ]; then
          echo "ERROR: Benchmark failed with exit code $status"
          exit 1
        fi
        # Fail if process dumped core
        if grep -q "dumped core" "$LOG"; then
          echo "ERROR: Benchmark crashed with core dump"
          exit 1
        fi
        # Fail if unsupported format
        if grep -qi "unknown format\|unsupported format\|not supported" "$LOG"; then
          echo "ERROR: Format ${{ matrix.format }} not supported by this build"
          exit 1
        fi
    - name: Upload benchmark logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: tpch-benchmark-logs-suite-${{ matrix.format }}-${{ matrix.table }}
        path: benchmark-results/${{ matrix.format }}_${{ matrix.table }}_baseline.log
        retention-days: 30
        if-no-files-found: ignore
# Compares the TPC-H write paths: every (format, table) pair is run in three
# modes - baseline (no extra flag), --zero-copy and --true-zero-copy.
# "image" selects both the container image and the build artifact.
tpch-optimization-benchmarks:
  name: TPC-H Optimization Benchmarks (${{ matrix.format }}-${{ matrix.mode }})
  runs-on: ubuntu-22.04
  needs: [resolve-images, build-matrix]
  timeout-minutes: 20
  container:
    image: ${{ matrix.image == 'base' && needs.resolve-images.outputs.base_image || matrix.image == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }}
    options: --user root
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # Parquet benchmarks use base image
        - {format: parquet, mode: baseline, table: lineitem, image: base}
        - {format: parquet, mode: baseline, table: orders, image: base}
        - {format: parquet, mode: baseline, table: part, image: base}
        - {format: parquet, mode: zero-copy, table: lineitem, image: base}
        - {format: parquet, mode: zero-copy, table: orders, image: base}
        - {format: parquet, mode: zero-copy, table: part, image: base}
        - {format: parquet, mode: true-zero-copy, table: lineitem, image: base}
        - {format: parquet, mode: true-zero-copy, table: orders, image: base}
        - {format: parquet, mode: true-zero-copy, table: part, image: base}
        # ORC benchmarks use orc image
        - {format: orc, mode: baseline, table: lineitem, image: orc}
        - {format: orc, mode: baseline, table: orders, image: orc}
        - {format: orc, mode: baseline, table: part, image: orc}
        - {format: orc, mode: zero-copy, table: lineitem, image: orc}
        - {format: orc, mode: zero-copy, table: orders, image: orc}
        - {format: orc, mode: zero-copy, table: part, image: orc}
        - {format: orc, mode: true-zero-copy, table: lineitem, image: orc}
        - {format: orc, mode: true-zero-copy, table: orders, image: orc}
        - {format: orc, mode: true-zero-copy, table: part, image: orc}
        # Lance benchmarks use lance image
        - {format: lance, mode: baseline, table: lineitem, image: lance}
        - {format: lance, mode: baseline, table: orders, image: lance}
        - {format: lance, mode: baseline, table: part, image: lance}
        - {format: lance, mode: zero-copy, table: lineitem, image: lance}
        - {format: lance, mode: zero-copy, table: orders, image: lance}
        - {format: lance, mode: zero-copy, table: part, image: lance}
        - {format: lance, mode: true-zero-copy, table: lineitem, image: lance}
        - {format: lance, mode: true-zero-copy, table: orders, image: lance}
        - {format: lance, mode: true-zero-copy, table: part, image: lance}
  steps:
    - name: Checkout code (for dists.dss and scripts)
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Checkout tpch submodule (for dists.dss)
      run: |
        git config --global --add safe.directory "$GITHUB_WORKSPACE"
        git submodule update --init --depth 1 -- third_party/tpch
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        name: tpch-benchmark-${{ matrix.image }}
        path: .
    - name: Setup benchmark executable
      run: |
        chmod +x tpch_benchmark
        mkdir -p benchmark-results
        # Only append the previous value when there is one: a trailing ':'
        # is an empty entry, which the loader treats as the current directory.
        echo "LD_LIBRARY_PATH=/opt/dependencies/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
    - name: Verify executable supports required format
      # Informational only: prints the start of the help text, never fails.
      run: |
        echo "=== Verifying build artifact ==="
        echo "Expected format: ${{ matrix.format }}"
        ./tpch_benchmark --help | head -20 || true
    - name: Run optimization benchmark
      # bash is requested explicitly because the script reads PIPESTATUS.
      shell: bash
      run: |
        LOG="benchmark-results/${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log"
        # Copy dists.dss to current directory (required by dbgen)
        cp third_party/tpch/dbgen/dists.dss . 2>/dev/null || true
        # Translate the matrix mode into the benchmark's command-line flag;
        # "baseline" adds nothing.
        MODE_FLAGS=""
        if [ "${{ matrix.mode }}" = "zero-copy" ]; then
          MODE_FLAGS="--zero-copy"
        elif [ "${{ matrix.mode }}" = "true-zero-copy" ]; then
          MODE_FLAGS="--true-zero-copy"
        fi
        # The benchmark output is filtered and tee'd into the log, so the
        # pipeline's own status is not the benchmark's. Take the status of
        # the first pipeline member (timeout/benchmark) from PIPESTATUS, with
        # exit-on-error suspended so a failing pipeline does not abort first.
        # $MODE_FLAGS stays unquoted so an empty value expands to no argument.
        set +e
        timeout 600 ./tpch_benchmark \
          --use-dbgen \
          --scale-factor 1 \
          --format ${{ matrix.format }} \
          --table ${{ matrix.table }} \
          --output-dir benchmark-results/ \
          $MODE_FLAGS \
          2>&1 | grep -v "^DEBUG:" | tee "$LOG"
        status=${PIPESTATUS[0]}
        set -e
        if [ "$status" -ne 0 ]; then
          echo "ERROR: Benchmark failed with exit code $status"
          exit 1
        fi
        # Fail if process dumped core
        if grep -q "dumped core" "$LOG"; then
          echo "ERROR: Benchmark crashed with core dump"
          exit 1
        fi
        # Fail if unsupported format
        if grep -qi "unknown format\|unsupported format\|not supported" "$LOG"; then
          echo "ERROR: Format ${{ matrix.format }} not supported by this build"
          exit 1
        fi
    - name: Upload benchmark logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: tpch-benchmark-logs-optimization-${{ matrix.format }}-${{ matrix.mode }}-${{ matrix.table }}
        path: benchmark-results/${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log
        retention-days: 30
        if-no-files-found: ignore
# Format coverage for TPC-DS: one job per (format, table) pair, each running
# tpcds_benchmark at scale factor 1. "build" selects both the container image
# and the build artifact the binary is taken from.
tpcds-benchmark-suite:
  name: TPC-DS Benchmark Suite
  runs-on: ubuntu-22.04
  needs: [resolve-images, build-matrix]
  timeout-minutes: 20
  container:
    image: ${{ matrix.build == 'base' && needs.resolve-images.outputs.base_image || matrix.build == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }}
    options: --user root
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # CSV format
        - {format: csv, table: store_returns, build: base}
        - {format: csv, table: store_sales, build: base}
        - {format: csv, table: customer, build: base}
        - {format: csv, table: item, build: base}
        # Parquet format
        - {format: parquet, table: store_returns, build: base}
        - {format: parquet, table: store_sales, build: base}
        - {format: parquet, table: customer, build: base}
        - {format: parquet, table: item, build: base}
        # ORC format
        - {format: orc, table: store_returns, build: orc}
        - {format: orc, table: store_sales, build: orc}
        # Lance format
        - {format: lance, table: store_returns, build: lance}
        - {format: lance, table: store_sales, build: lance}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        name: tpch-benchmark-${{ matrix.build }}
        path: .
    - name: Setup benchmark executable
      run: |
        chmod +x tpcds_benchmark
        mkdir -p benchmark-results
        # Only append the previous value when there is one: a trailing ':'
        # is an empty entry, which the loader treats as the current directory.
        echo "LD_LIBRARY_PATH=/opt/dependencies/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
    - name: Run format coverage benchmark
      # bash is requested explicitly because the script reads PIPESTATUS.
      shell: bash
      run: |
        LOG="benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log"
        # The benchmark output is filtered and tee'd into the log, so the
        # pipeline's own status is not the benchmark's. Take the status of
        # the first pipeline member (timeout/benchmark) from PIPESTATUS, with
        # exit-on-error suspended so a failing pipeline does not abort first.
        set +e
        timeout 600 ./tpcds_benchmark \
          --scale-factor 1 \
          --format ${{ matrix.format }} \
          --table ${{ matrix.table }} \
          --output-dir benchmark-results/ \
          2>&1 | grep -v "^DEBUG:" | tee "$LOG"
        status=${PIPESTATUS[0]}
        set -e
        if [ "$status" -ne 0 ]; then
          echo "ERROR: Benchmark failed with exit code $status"
          exit 1
        fi
        # Fail if process dumped core
        if grep -q "dumped core" "$LOG"; then
          echo "ERROR: Benchmark crashed with core dump"
          exit 1
        fi
        # Fail if unsupported format
        if grep -qi "unknown format\|unsupported format\|not supported" "$LOG"; then
          echo "ERROR: Format ${{ matrix.format }} not supported by this build"
          exit 1
        fi
    - name: Upload benchmark logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: tpcds-benchmark-logs-suite-${{ matrix.format }}-${{ matrix.table }}
        path: benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log
        retention-days: 30
        if-no-files-found: ignore
# Compares the TPC-DS write paths: each (format, table) pair is run in
# baseline mode (no extra flag) and with --zero-copy. "image" selects both
# the container image and the build artifact.
tpcds-optimization-benchmarks:
  name: TPC-DS Optimization Benchmarks (${{ matrix.format }}-${{ matrix.mode }})
  runs-on: ubuntu-22.04
  needs: [resolve-images, build-matrix]
  timeout-minutes: 20
  container:
    # Same three-way selector as the other benchmark jobs. The matrix below
    # has no orc entries today, so the orc branch is currently never taken.
    image: ${{ matrix.image == 'base' && needs.resolve-images.outputs.base_image || matrix.image == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }}
    options: --user root
    credentials:
      username: ${{ github.actor }}
      password: ${{ secrets.GITHUB_TOKEN }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # Parquet benchmarks
        - {format: parquet, mode: baseline, table: store_returns, image: base}
        - {format: parquet, mode: baseline, table: store_sales, image: base}
        - {format: parquet, mode: zero-copy, table: store_returns, image: base}
        - {format: parquet, mode: zero-copy, table: store_sales, image: base}
        # Lance benchmarks
        - {format: lance, mode: baseline, table: store_returns, image: lance}
        - {format: lance, mode: baseline, table: store_sales, image: lance}
        - {format: lance, mode: zero-copy, table: store_returns, image: lance}
        - {format: lance, mode: zero-copy, table: store_sales, image: lance}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        name: tpch-benchmark-${{ matrix.image }}
        path: .
    - name: Setup benchmark executable
      run: |
        chmod +x tpcds_benchmark
        mkdir -p benchmark-results
        # Only append the previous value when there is one: a trailing ':'
        # is an empty entry, which the loader treats as the current directory.
        echo "LD_LIBRARY_PATH=/opt/dependencies/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
    - name: Run optimization benchmark
      # bash is requested explicitly because the script reads PIPESTATUS.
      shell: bash
      run: |
        LOG="benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log"
        # Translate the matrix mode into the benchmark's command-line flag;
        # "baseline" adds nothing.
        MODE_FLAGS=""
        if [ "${{ matrix.mode }}" = "zero-copy" ]; then
          MODE_FLAGS="--zero-copy"
        fi
        # The benchmark output is filtered and tee'd into the log, so the
        # pipeline's own status is not the benchmark's. Take the status of
        # the first pipeline member (timeout/benchmark) from PIPESTATUS, with
        # exit-on-error suspended so a failing pipeline does not abort first.
        # $MODE_FLAGS stays unquoted so an empty value expands to no argument.
        set +e
        timeout 600 ./tpcds_benchmark \
          --scale-factor 1 \
          --format ${{ matrix.format }} \
          --table ${{ matrix.table }} \
          --output-dir benchmark-results/ \
          $MODE_FLAGS \
          2>&1 | grep -v "^DEBUG:" | tee "$LOG"
        status=${PIPESTATUS[0]}
        set -e
        if [ "$status" -ne 0 ]; then
          echo "ERROR: Benchmark failed with exit code $status"
          exit 1
        fi
        # Fail if process dumped core
        if grep -q "dumped core" "$LOG"; then
          echo "ERROR: Benchmark crashed with core dump"
          exit 1
        fi
        # Fail if unsupported format
        if grep -qi "unknown format\|unsupported format\|not supported" "$LOG"; then
          echo "ERROR: Format ${{ matrix.format }} not supported by this build"
          exit 1
        fi
    - name: Upload benchmark logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: tpcds-benchmark-logs-optimization-${{ matrix.format }}-${{ matrix.mode }}-${{ matrix.table }}
        path: benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log
        retention-days: 30
        if-no-files-found: ignore
# Collects the log artifacts of all benchmark jobs, turns them into a JSON
# summary plus an HTML report, and publishes everything as the single
# "benchmark-results" artifact. Runs even when benchmark jobs failed.
results-aggregation:
  name: Aggregate Results
  runs-on: ubuntu-22.04
  needs: [tpch-benchmark-suite, tpch-optimization-benchmarks, tpcds-benchmark-suite, tpcds-optimization-benchmarks]
  if: always()
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Download all benchmark artifacts
      uses: actions/download-artifact@v4
      with:
        # Only the log artifacts are needed here; the pattern skips the
        # "tpch-benchmark-<config>" build artifacts that hold the binaries.
        pattern: '*-benchmark-logs-*'
        path: all-results
    - name: Prepare benchmark results directory
      run: |
        mkdir -p benchmark-results
        # NOTE(review): logs are flattened into one directory, and the suite
        # jobs and the "baseline" optimization jobs use the same file name
        # for the same format/table, so one of the two copies overwrites the
        # other here. Confirm whether the parser should see both.
        if [ -d all-results ]; then
          find all-results -name "*.log" -exec cp {} benchmark-results/ \;
        else
          echo "No benchmark log artifacts were downloaded"
        fi
    - name: Generate summary report
      run: |
        # Write to a temporary file first so that a failing parser cannot
        # leave an empty or truncated ci_summary.json behind for the
        # visualization step to pick up. Failure stays non-fatal.
        tmp_summary=$(mktemp)
        if python3 scripts/parse_benchmark_logs.py benchmark-results > "$tmp_summary"; then
          mv "$tmp_summary" benchmark-results/ci_summary.json
        else
          echo "WARNING: parse_benchmark_logs.py failed; no summary generated"
          rm -f "$tmp_summary"
        fi
    - name: Generate HTML visualization
      if: always()
      run: |
        if [ -f benchmark-results/ci_summary.json ]; then
          python3 scripts/visualize_benchmark_results.py benchmark-results/ci_summary.json benchmark-results/report.html
        else
          echo "No summary JSON found, skipping visualization"
        fi
    - name: Upload aggregated results
      # Publish whatever was collected even if an earlier step failed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-results
        path: |
          benchmark-results/*.log
          benchmark-results/*.json
          benchmark-results/*.html
        retention-days: 30
        if-no-files-found: ignore
    - name: Print summary
      if: always()
      run: |
        if [ -f benchmark-results/ci_summary.json ]; then
          echo "=== Benchmark Summary ==="
          python3 -m json.tool benchmark-results/ci_summary.json || cat benchmark-results/ci_summary.json
        else
          echo "No summary generated (logs may not exist yet)"
        fi
# Final gate on the build-matrix job: succeeds only when its aggregated
# result is "success". Runs regardless of how the build job ended.
status-check:
  name: Status Check
  runs-on: ubuntu-22.04
  needs: [build-matrix]
  if: always()
  steps:
    - name: Check build status
      run: |
        build_result="${{ needs.build-matrix.result }}"
        # Happy path leaves early; everything else is reported as a failure.
        case "$build_result" in
          success)
            echo "✓ All builds and tests passed"
            exit 0
            ;;
        esac
        echo "✗ Some builds or tests failed"
        echo "Build status: $build_result"
        exit 1