nextcloud · marcelklehr · Jun 25, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 18, 2026
diff --git a/.github/workflows/integration_test.yml b/.github/workflows/integration_test.yml
@@ -28,7 +28,7 @@ jobs:
       matrix:
         php-versions: [ '8.3' ]
         databases: [ 'sqlite' ]
-        server-versions: [ 'master', 'stable33', 'stable32', 'stable31', 'stable30' ]
+        server-versions: [ 'master', 'stable34', 'stable33', 'stable32', 'stable31', 'stable30' ]
 
     name: Integration test on ☁️${{ matrix.server-versions }} 🐘${{ matrix.php-versions }}
 
@@ -106,13 +106,21 @@ jobs:
           ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password
           composer run serve &
 
+      - name: Checkout app_api
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        with:
+          path: apps/app_api/
+          repository: nextcloud/app_api
+          ref: ${{ matrix.server-versions == 'master' && 'main' || matrix.server-versions }}
+          persist-credentials: false
+
       - name: Enable app and app_api
         run: ./occ app:enable -vvv -f app_api
 
       - name: Setup python 3.10
         uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
         with:
-          python-version: '3.10'
+          python-version: '3.13'
           cache: 'pip'
           cache-dependency-path: context_chat_backend/requirements.txt
 
@@ -122,6 +130,9 @@ jobs:
           sudo apt install pipx
           pipx install poetry
           poetry install
+          # pyproject.toml pins xllamacpp to the cu128 wheel, which needs libcuda.so.1 at import
+          # time. The CI runner has no NVIDIA driver, so we swap in the PyPI CPU build for tests.
+          poetry run pip install --force-reinstall --no-deps xllamacpp
 
       - name: Cache llm2 models
         uses: actions/cache/restore@v5
@@ -137,6 +148,7 @@ jobs:
         env:
           APP_VERSION: ${{ fromJson(steps.appinfo.outputs.result).version }}
         run: |
+          mkdir -p "$(pwd)/../../llm2-persistent_storage/"
           APP_PERSISTENT_STORAGE="$(pwd)/../../llm2-persistent_storage/" poetry run python3 main.py > ../backend_logs 2>&1 &
 
       - name: Register backend

diff --git a/.github/workflows/publish-docker-cpu.yml → .github/workflows/publish-docker.yml b/.github/workflows/publish-docker-cpu.yml → .github/workflows/publish-docker.yml
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2025 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
-name: Publish CPU Image
+name: Publish Docker Images
 
 on:
   push:
@@ -15,9 +15,13 @@ permissions:
 
 jobs:
   push_to_registry:
-    name: Build image
+    name: Build ${{ matrix.variant }} image
     runs-on: ubuntu-22.04
     if: ${{ github.repository_owner == 'nextcloud' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: [cuda, rocm, cpu]
     steps:
       - name: Set app env
         run: |
@@ -93,14 +97,29 @@ jobs:
         run: |
           echo "Extracted version: ${{ env.VERSION }}"
 
+      # AppAPI selects ExApp images by tag suffix per the deploy daemon's computeDevice:
+      # CPU is the unsuffixed default (`<image>:<version>`), CUDA is `<image>:<version>-cuda`,
+      # ROCm is `<image>:<version>-rocm`.
+      - name: Compute image tags
+        id: tags
+        run: |
+          if [ "${{ matrix.variant }}" = "cpu" ]; then
+            SUFFIX=""
+          else
+            SUFFIX="-${{ matrix.variant }}"
+          fi
+          {
+            echo "tags<<EOF"
+            echo "ghcr.io/nextcloud/${{ env.APP_NAME }}:${{ env.VERSION }}${SUFFIX}"
+            echo "ghcr.io/nextcloud/${{ env.APP_NAME }}:latest${SUFFIX}"
+            echo "EOF"
+          } >> "$GITHUB_OUTPUT"
+
       - name: Build container image
         uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0
         with:
           push: true
           context: ./${{ env.APP_NAME }}
+          file: ./${{ env.APP_NAME }}/Dockerfile.${{ matrix.variant }}
           platforms: linux/amd64
-          tags: |
-            ghcr.io/nextcloud/${{ env.APP_NAME }}:${{ env.VERSION }}
-            ghcr.io/nextcloud/${{ env.APP_NAME }}:latest
-          build-args: |
-            BUILD_TYPE=cpu
+          tags: ${{ steps.tags.outputs.tags }}
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
+# SPDX-License-Identifier: AGPL-3.0-or-later
+FROM docker.io/ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update --fix-missing && \
+    apt-get install -y python3 python3-venv pipx build-essential git vim curl && \
+    rm -rf /var/lib/apt/lists/*
+RUN pipx install poetry
+
+# Download and install FRP client into /usr/local/bin.
+RUN set -ex; \
+    ARCH=$(uname -m); \
+    if [ "$ARCH" = "aarch64" ]; then \
+      FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_arm64.tar.gz"; \
+    else \
+      FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_amd64.tar.gz"; \
+    fi; \
+    echo "Downloading FRP client from $FRP_URL"; \
+    curl -L "$FRP_URL" -o /tmp/frp.tar.gz; \
+    tar -C /tmp -xzf /tmp/frp.tar.gz; \
+    mv /tmp/frp_0.61.1_linux_* /tmp/frp; \
+    cp /tmp/frp/frpc /usr/local/bin/frpc; \
+    chmod +x /usr/local/bin/frpc; \
+    rm -rf /tmp/frp /tmp/frp.tar.gz
+
+ENV DEBIAN_FRONTEND=dialog
+ENV PATH="/root/.local/bin:${PATH}"
+ENV COMPUTE_DEVICE=CPU
+
+WORKDIR /app
+
+# Install requirements
+COPY pyproject.toml .
+COPY poetry.lock .
+COPY healthcheck.sh .
+COPY --chmod=775 start.sh /
+
+RUN poetry install
+# pyproject.toml pins the cu128 wheel for GPU production; swap to the PyPI CPU build
+# so this image runs without libcuda.so.1 / CUDA runtime libraries.
+RUN poetry run pip install --force-reinstall --no-deps xllamacpp
+
+COPY lib /app/lib
+COPY models /app/models
+COPY default_config /app/default_config
+
+WORKDIR /app/lib
+ENTRYPOINT ["/start.sh", "poetry", "run", "python3", "main.py"]
+
+LABEL org.opencontainers.image.source=https://github.com/nextcloud/llm2
+HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
diff --git a/Dockerfile → Dockerfile.cuda b/Dockerfile → Dockerfile.cuda
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
+# SPDX-License-Identifier: AGPL-3.0-or-later
+FROM docker.io/rocm/dev-ubuntu-22.04:6.4.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update --fix-missing && \
+    apt-get install -y pipx build-essential git vim curl && \
+    rm -rf /var/lib/apt/lists/*
+RUN pipx install poetry
+
+# Download and install FRP client into /usr/local/bin.
+RUN set -ex; \
+    ARCH=$(uname -m); \
+    if [ "$ARCH" = "aarch64" ]; then \
+      FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_arm64.tar.gz"; \
+    else \
+      FRP_URL="https://raw.githubusercontent.com/nextcloud/HaRP/main/exapps_dev/frp_0.61.1_linux_amd64.tar.gz"; \
+    fi; \
+    echo "Downloading FRP client from $FRP_URL"; \
+    curl -L "$FRP_URL" -o /tmp/frp.tar.gz; \
+    tar -C /tmp -xzf /tmp/frp.tar.gz; \
+    mv /tmp/frp_0.61.1_linux_* /tmp/frp; \
+    cp /tmp/frp/frpc /usr/local/bin/frpc; \
+    chmod +x /usr/local/bin/frpc; \
+    rm -rf /tmp/frp /tmp/frp.tar.gz
+
+ENV DEBIAN_FRONTEND=dialog
+ENV PATH="/root/.local/bin:${PATH}"
+
+WORKDIR /app
+
+# Install requirements
+COPY pyproject.toml .
+COPY poetry.lock .
+COPY healthcheck.sh .
+COPY --chmod=775 start.sh /
+
+RUN poetry install
+# pyproject.toml pins the cu128 wheel; swap to the matching ROCm wheel.
+RUN poetry run pip install --force-reinstall --no-deps \
+    --index-url https://xorbitsai.github.io/xllamacpp/whl/rocm-6.4.1 xllamacpp
+
+COPY lib /app/lib
+COPY models /app/models
+COPY default_config /app/default_config
+
+WORKDIR /app/lib
+ENTRYPOINT ["/start.sh", "poetry", "run", "python3", "main.py"]
+
+LABEL org.opencontainers.image.source=https://github.com/nextcloud/llm2
+HEALTHCHECK --interval=2s --timeout=2s --retries=300 CMD /app/healthcheck.sh
diff --git a/README.md b/README.md
@@ -70,14 +70,16 @@ See [the Nextcloud admin documentation](https://docs.nextcloud.com/server/latest
 ## Development installation using Docker
 
 > [!NOTE]
-> Currently, running the Docker image requires that your host system have CUDA/NVIDIA drivers installed and is equipped with a GPU capable of performing the required tasks.
+> The CUDA image requires that your host system has CUDA/NVIDIA drivers installed and is equipped with a GPU capable of performing the required tasks. The CPU image runs anywhere but is slower.
 
-0. [Install Nvidia drivers and CUDA on your host system](https://gist.github.com/denguir/b21aa66ae7fb1089655dd9de8351a202) and [install NVIDIA Docker toolkit](https://stackoverflow.com/questions/25185405/using-gpu-from-a-docker-container).
+0. (CUDA image only) [Install NVIDIA drivers and CUDA on your host system](https://gist.github.com/denguir/b21aa66ae7fb1089655dd9de8351a202) and [install NVIDIA Docker toolkit](https://stackoverflow.com/questions/25185405/using-gpu-from-a-docker-container).
 
-1. Build the Docker image:
+1. Build the Docker image (pick one). Per the [AppAPI image convention](https://docs.nextcloud.com/server/latest/developer_manual/exapp_development/faq/GpuSupport.html), CPU is the unsuffixed default and GPU variants are suffixed:
 
    ```sh
-   docker build --no-cache -f Dockerfile -t llm2:latest .
+   docker build --no-cache -f Dockerfile.cpu  -t llm2:latest      .
+   docker build --no-cache -f Dockerfile.cuda -t llm2:latest-cuda .
+   docker build --no-cache -f Dockerfile.rocm -t llm2:latest-rocm .
    ```
 
 2. Run the Docker image:

diff --git a/default_config/config.json b/default_config/config.json
@@ -62,6 +62,7 @@
         "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
         "loader_config": {
             "n_ctx": 24000,
+            "n_batch": 8,
             "max_tokens": 8192,
             "stop": ["<|eot_id|>"],
             "temperature": 0.7
@@ -86,11 +87,21 @@
         }
     },
     "Olmo-3-7B-Instruct-Q4_K_M": {
-        "prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
         "loader_config": {
             "n_ctx": 4096,
             "max_tokens": 2048,
             "stop": ["<|endoftext|>"],
+            "temperature": 0.4,
+            "enable_chat_template": 1,
+            "chat_template": "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n' -}}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"
+        }
+    },
+    "Olmo-3-7B-Think-Q4_K_M": {
+        "prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
+        "loader_config": {
+            "n_ctx": 4096,
+            "max_tokens": 2048,
+            "stop": ["<|endoftext|>"],
             "temperature": 0.4
         }
     },
@@ -102,6 +113,16 @@
             "stop": ["<|endoftext|>"],
             "temperature": 0.3
         }
+    },
+    "gemma-4-E4B-it-Q4_K_S": {
+        "prompt": "<|endoftext|><|system|>\n{system_prompt}\n<|user|>\n{system_prompt}\n{user_prompt}\n<|assistant|>\n",
+        "loader_config": {
+            "n_parallel": 1,
+            "n_ctx": 4096,
+            "max_tokens": 2048,
+            "stop": ["<|endoftext|>"],
+            "temperature": 0.7
+        }
     },
     "default": {
         "prompt": "<|im_start|> system\n{system_prompt}\n<|im_end|>\n<|im_start|> user\n{user_prompt}\n<|im_end|>\n<|im_start|> assistant\n",

diff --git a/lib/change_tone.py b/lib/change_tone.py
@@ -35,10 +35,22 @@ class ChangeToneProcessor:
     def __init__(self, runnable: Runnable):
         self.runnable = runnable
 
-    def __call__(self, input_data: dict, context: StreamContext | None = None) -> dict[str, Any]:
-        """Process a single input"""
+    async def __call__(self, input_data: dict, context: StreamContext | None = None) -> dict[str, Any]:
+        """Process a single input.
+
+        Reads ``input_data['input']`` and ``input_data['tone']``, awaits
+        ``run_runnable_with_streaming`` and returns ``{'output', 'reasoning'}``;
+        ``'reasoning'`` is the sink's ``'reasoning'`` entry, or ``''`` if absent.
+        """
         messages = [
             SystemMessage(content=self.system_prompt),
             HumanMessage(content=self.user_prompt.format_prompt(text=input_data['input'], tone=input_data['tone']).to_string())
         ]
-        return {'output': run_runnable_with_streaming(self.runnable, messages, context)}
+        reasoning_sink: dict[str, str] = {}
+        output = await run_runnable_with_streaming(
+            self.runnable,
+            messages,
+            context,
+            reasoning_sink=reasoning_sink,
+        )
+        return {'output': output, 'reasoning': reasoning_sink.get('reasoning', '')}
diff --git a/lib/chat.py b/lib/chat.py
@@ -20,7 +20,7 @@ class ChatProcessor:
     def __init__(self, runner: Runnable):
         self.runnable = runner
 
-    def __call__(
+    async def __call__(
             self,
             inputs: dict[str, Any],
             context: StreamContext | None = None,
@@ -32,10 +32,14 @@ def __call__(
             (message['role'], message['content'])
             for message in [json.loads(message) for message in inputs['history']]
         ] + [('human', inputs['input'])]
+        reasoning_sink: dict[str, str] = {}
+        output = await run_runnable_with_streaming(
+            self.runnable,
+            messages,
+            context,
+            reasoning_sink=reasoning_sink,
+        )
         return {
-            'output': run_runnable_with_streaming(
-                self.runnable,
-                messages,
-                context,
-            )
+            'output': output,
+            'reasoning': reasoning_sink.get('reasoning', ''),
         }