Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0502b48
feat: Use llama-cpp-server to allow batching tasks
marcelklehr Jun 17, 2026
baa6bf8
fix: Fix health check and set batch size correctly
marcelklehr Jun 17, 2026
81922de
feat: Make processing async
marcelklehr Jun 18, 2026
1f67ac7
feat: Migrate from llama-cpp-python to xllamacpp
marcelklehr Jun 22, 2026
a46be66
fix: Do not set tasks to running if they're only still queued
marcelklehr Jun 22, 2026
99665fb
fix: Improve xllamacpp error handling
marcelklehr Jun 22, 2026
975a562
feat: Provide reasoning content for chat task types
marcelklehr Jun 22, 2026
307639f
chore: Update poetry lock file
marcelklehr Jun 22, 2026
38dd466
fix(ci): Use latest app_api version
marcelklehr Jun 22, 2026
eadb77e
fix: Catch errors in the task loop and refresh processors upon init
marcelklehr Jun 23, 2026
758c18c
fix(ci): Make sure persistent_storage dir exists
marcelklehr Jun 24, 2026
b8eb2cf
fix(ci): Use the xllamacpp cpu wheel in CI
marcelklehr Jun 24, 2026
581ff68
feat: Add cuda and rocm docker builds
marcelklehr Jun 24, 2026
c7c3043
fix(ci): Install cpu build of xllamacpp correctly
marcelklehr Jun 24, 2026
d6e5c2f
fix(ci): Also test on stable34
marcelklehr Jun 24, 2026
ba1be60
fix(ci): Upgrade python
marcelklehr Jun 24, 2026
2fd471b
feat: Add reasoning output to all processors
marcelklehr Jun 24, 2026
9e7c153
feat: Switch to Olmo 3 Think
marcelklehr Jun 24, 2026
5cddedd
fix: Switch back to Olmo 3 Instruct and fix chat template
marcelklehr Jun 25, 2026
715449e
fix: Improve tool parser support for Olmo and Qwen 3.5
marcelklehr Jun 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions .github/workflows/integration_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
matrix:
php-versions: [ '8.3' ]
databases: [ 'sqlite' ]
server-versions: [ 'master', 'stable33', 'stable32', 'stable31', 'stable30' ]
server-versions: [ 'master', 'stable34', 'stable33', 'stable32', 'stable31', 'stable30' ]

name: Integration test on ☁️${{ matrix.server-versions }} 🐘${{ matrix.php-versions }}

Expand Down Expand Up @@ -106,13 +106,21 @@ jobs:
./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password
composer run serve &

- name: Checkout app_api
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
with:
path: apps/app_api/
repository: nextcloud/app_api
ref: ${{ matrix.server-versions == 'master' && 'main' || matrix.server-versions }}
persist-credentials: false

- name: Enable app and app_api
run: ./occ app:enable -vvv -f app_api

- name: Setup python 3.10
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.10'
python-version: '3.13'
cache: 'pip'
cache-dependency-path: context_chat_backend/requirements.txt

Expand All @@ -122,6 +130,9 @@ jobs:
sudo apt install pipx
pipx install poetry
poetry install
# pyproject.toml pins xllamacpp to the cu128 wheel, which needs libcuda.so.1 at import
# time. The CI runner has no NVIDIA driver, so we swap in the PyPI CPU build for tests.
poetry run pip install --force-reinstall --no-deps xllamacpp

- name: Cache llm2 models
uses: actions/cache/restore@v5
Expand All @@ -137,6 +148,7 @@ jobs:
env:
APP_VERSION: ${{ fromJson(steps.appinfo.outputs.result).version }}
run: |
mkdir -p "$(pwd)/../../llm2-persistent_storage/"
APP_PERSISTENT_STORAGE="$(pwd)/../../llm2-persistent_storage/" poetry run python3 main.py > ../backend_logs 2>&1 &

- name: Register backend
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
# SPDX-License-Identifier: AGPL-3.0-or-later

name: Publish CPU Image
name: Publish Docker Images

on:
push:
Expand All @@ -15,9 +15,13 @@ permissions:

jobs:
push_to_registry:
name: Build image
name: Build ${{ matrix.variant }} image
runs-on: ubuntu-22.04
if: ${{ github.repository_owner == 'nextcloud' }}
strategy:
fail-fast: false
matrix:
variant: [cuda, rocm, cpu]
steps:
- name: Set app env
run: |
Expand Down Expand Up @@ -93,14 +97,29 @@ jobs:
run: |
echo "Extracted version: ${{ env.VERSION }}"

# AppAPI selects ExApp images by tag suffix per the deploy daemon's computeDevice:
# CPU is the unsuffixed default (`<image>:<version>`), CUDA is `<image>:<version>-cuda`,
# ROCm is `<image>:<version>-rocm`.
- name: Compute image tags
id: tags
run: |
if [ "${{ matrix.variant }}" = "cpu" ]; then
SUFFIX=""
else
SUFFIX="-${{ matrix.variant }}"
fi
{
echo "tags<<EOF"
echo "ghcr.io/nextcloud/${{ env.APP_NAME }}:${{ env.VERSION }}${SUFFIX}"
echo "ghcr.io/nextcloud/${{ env.APP_NAME }}:latest${SUFFIX}"
echo "EOF"
} >> "$GITHUB_OUTPUT"

- name: Build container image
uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0
with:
push: true
context: ./${{ env.APP_NAME }}
file: ./${{ env.APP_NAME }}/Dockerfile.${{ matrix.variant }}
platforms: linux/amd64
tags: |
ghcr.io/nextcloud/${{ env.APP_NAME }}:${{ env.VERSION }}
ghcr.io/nextcloud/${{ env.APP_NAME }}:latest
build-args: |
BUILD_TYPE=cpu
tags: ${{ steps.tags.outputs.tags }}
53 changes: 53 additions & 0 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# CPU-only image: plain Ubuntu base, xllamacpp replaced by the PyPI CPU build below.
FROM docker.io/ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install in ONE layer, so a cached (stale) index
# layer can never be combined with a freshly executed install step.
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
RUN apt-get update --fix-missing \
    && apt-get install -y python3 python3-venv pipx build-essential git vim curl
RUN pipx install poetry

# Download and install FRP client into /usr/local/bin.
# curl -f turns an HTTP error into a build failure at the download step instead
# of handing an HTML error page to tar.
RUN set -ex; \
    ARCH=$(uname -m); \
    if [ "$ARCH" = "aarch64" ]; then \
        FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_arm64.tar.gz"; \
    else \
        FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_amd64.tar.gz"; \
    fi; \
    echo "Downloading FRP client from $FRP_URL"; \
    curl -fL "$FRP_URL" -o /tmp/frp.tar.gz; \
    tar -C /tmp -xzf /tmp/frp.tar.gz; \
    mv /tmp/frp_0.61.1_linux_* /tmp/frp; \
    cp /tmp/frp/frpc /usr/local/bin/frpc; \
    chmod +x /usr/local/bin/frpc; \
    rm -rf /tmp/frp /tmp/frp.tar.gz

ENV DEBIAN_FRONTEND=dialog
# Make the pipx-installed poetry executable available on PATH.
ENV PATH="/root/.local/bin:${PATH}"
ENV COMPUTE_DEVICE=CPU

WORKDIR /app

# Install requirements
COPY pyproject.toml .
COPY poetry.lock .
COPY healthcheck.sh .
COPY --chmod=775 start.sh /

RUN poetry install
# pyproject.toml pins the cu128 wheel for GPU production; swap to the PyPI CPU build
# so this image runs without libcuda.so.1 / CUDA runtime libraries.
RUN poetry run pip install --force-reinstall --no-deps xllamacpp

# COPY rather than ADD: these are plain local directories, so ADD's archive
# extraction and URL handling are not wanted here.
COPY lib /app/lib
COPY models /app/models
COPY default_config /app/default_config

WORKDIR /app/lib
ENTRYPOINT ["/start.sh", "poetry", "run", "python3", "main.py"]

LABEL org.opencontainers.image.source=https://github.com/nextcloud/llm2
HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
File renamed without changes.
52 changes: 52 additions & 0 deletions Dockerfile.rocm
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# ROCm image: ROCm 6.4.1 dev base, xllamacpp replaced by the matching ROCm wheel below.
FROM docker.io/rocm/dev-ubuntu-22.04:6.4.1

ENV DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install in ONE layer, so a cached (stale) index
# layer can never be combined with a freshly executed install step.
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
RUN apt-get update --fix-missing \
    && apt-get install -y pipx build-essential git vim curl
RUN pipx install poetry

# Download and install FRP client into /usr/local/bin.
# curl -f turns an HTTP error into a build failure at the download step instead
# of handing an HTML error page to tar.
RUN set -ex; \
    ARCH=$(uname -m); \
    if [ "$ARCH" = "aarch64" ]; then \
        FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_arm64.tar.gz"; \
    else \
        FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_amd64.tar.gz"; \
    fi; \
    echo "Downloading FRP client from $FRP_URL"; \
    curl -fL "$FRP_URL" -o /tmp/frp.tar.gz; \
    tar -C /tmp -xzf /tmp/frp.tar.gz; \
    mv /tmp/frp_0.61.1_linux_* /tmp/frp; \
    cp /tmp/frp/frpc /usr/local/bin/frpc; \
    chmod +x /usr/local/bin/frpc; \
    rm -rf /tmp/frp /tmp/frp.tar.gz

ENV DEBIAN_FRONTEND=dialog
# Make the pipx-installed poetry executable available on PATH.
ENV PATH="/root/.local/bin:${PATH}"

WORKDIR /app

# Install requirements
COPY pyproject.toml .
COPY poetry.lock .
COPY healthcheck.sh .
COPY --chmod=775 start.sh /

RUN poetry install
# pyproject.toml pins the cu128 wheel; swap to the matching ROCm wheel.
RUN poetry run pip install --force-reinstall --no-deps \
    --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.4.1 xllamacpp

# COPY rather than ADD: these are plain local directories, so ADD's archive
# extraction and URL handling are not wanted here.
COPY lib /app/lib
COPY models /app/models
COPY default_config /app/default_config

WORKDIR /app/lib
ENTRYPOINT ["/start.sh", "poetry", "run", "python3", "main.py"]

LABEL org.opencontainers.image.source=https://github.com/nextcloud/llm2
HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,16 @@ See [the Nextcloud admin documentation](https://docs.nextcloud.com/server/latest
## Development installation using Docker

> [!NOTE]
> Currently, running the Docker image requires that your host system have CUDA/NVIDIA drivers installed and is equipped with a GPU capable of performing the required tasks.
> The CUDA image requires that your host system has CUDA/NVIDIA drivers installed and is equipped with a GPU capable of performing the required tasks. The CPU image runs on any host but is slower.

0. [Install Nvidia drivers and CUDA on your host system](https://gist.github.com/denguir/b21aa66ae7fb1089655dd9de8351a202) and [install NVIDIA Docker toolkit](https://stackoverflow.com/questions/25185405/using-gpu-from-a-docker-container).
0. (CUDA image only) [Install NVIDIA drivers and CUDA on your host system](https://gist.github.com/denguir/b21aa66ae7fb1089655dd9de8351a202) and [install the NVIDIA Docker toolkit](https://stackoverflow.com/questions/25185405/using-gpu-from-a-docker-container).

1. Build the Docker image:
1. Build the Docker image (pick one). Per the [AppAPI image convention](https://docs.nextcloud.com/server/latest/developer_manual/exapp_development/faq/GpuSupport.html), the CPU image uses the unsuffixed tag and the GPU variants carry a `-cuda` or `-rocm` suffix:

```sh
docker build --no-cache -f Dockerfile -t llm2:latest .
docker build --no-cache -f Dockerfile.cpu -t llm2:latest .
docker build --no-cache -f Dockerfile.cuda -t llm2:latest-cuda .
docker build --no-cache -f Dockerfile.rocm -t llm2:latest-rocm .
```

2. Run the Docker image:
Expand Down
23 changes: 22 additions & 1 deletion default_config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
"loader_config": {
"n_ctx": 24000,
"n_batch": 8,
"max_tokens": 8192,
"stop": ["<|eot_id|>"],
"temperature": 0.7
Expand All @@ -86,11 +87,21 @@
}
},
"Olmo-3-7B-Instruct-Q4_K_M": {
"prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
"loader_config": {
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["<|endoftext|>"],
"temperature": 0.4,
"enable_chat_template": 1,
"chat_template": "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n' -}}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"
}
},
"Olmo-3-7B-Think-Q4_K_M": {
"prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
"loader_config": {
"n_ctx": 2096,
"max_tokens": 2048,
"stop": ["<|endoftext|>"],
"temperature": 0.4
}
},
Expand All @@ -102,6 +113,16 @@
"stop": ["<|endoftext|>"],
"temperature": 0.3
}
},
"gemma-4-E4B-it-Q4_K_S": {
"prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
"loader_config": {
"n_parallel": 1,
"n_ctx": 4096,
"max_tokens": 2048,
"stop": ["<|endoftext|>"],
"temperature": 0.7
}
},
"default": {
"prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",
Expand Down
12 changes: 9 additions & 3 deletions lib/change_tone.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@ class ChangeToneProcessor:
def __init__(self, runnable: Runnable):
self.runnable = runnable

def __call__(self, input_data: dict, context: StreamContext | None = None) -> dict[str, Any]:
"""Process a single input"""
async def __call__(self, input_data: dict, context: StreamContext | None = None) -> dict[str, Any]:
messages = [
SystemMessage(content=self.system_prompt),
HumanMessage(content=self.user_prompt.format_prompt(text=input_data['input'], tone=input_data['tone']).to_string())
]
return {'output': run_runnable_with_streaming(self.runnable, messages, context)}
reasoning_sink: dict[str, str] = {}
output = await run_runnable_with_streaming(
self.runnable,
messages,
context,
reasoning_sink=reasoning_sink,
)
return {'output': output, 'reasoning': reasoning_sink.get('reasoning', '')}
16 changes: 10 additions & 6 deletions lib/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class ChatProcessor:
def __init__(self, runner: Runnable):
self.runnable = runner

def __call__(
async def __call__(
self,
inputs: dict[str, Any],
context: StreamContext | None = None,
Expand All @@ -32,10 +32,14 @@ def __call__(
(message['role'], message['content'])
for message in [json.loads(message) for message in inputs['history']]
] + [('human', inputs['input'])]
reasoning_sink: dict[str, str] = {}
output = await run_runnable_with_streaming(
self.runnable,
messages,
context,
reasoning_sink=reasoning_sink,
)
return {
'output': run_runnable_with_streaming(
self.runnable,
messages,
context,
)
'output': output,
'reasoning': reasoning_sink.get('reasoning', ''),
}
Loading
Loading