Skip to content

DuckDB Python read_avro fails on empty Avro container #276

@sagi-mh

Description

@sagi-mh

What happens?

DuckDB Python fails to read a valid empty Avro container (schema-only, 0 records). Reading a 1-record Avro file works. The same empty file appears to be readable in the DuckDB CLI, but reading it from Python fails with `Invalid Input Error: Cannot read file block count: ...` (full message in the output below).

To Reproduce

#!/usr/bin/env python3
"""
Repro: DuckDB Python read_avro fails on empty Avro container.

Requires:
  - fastavro
  - duckdb python package installed
"""
import tempfile
from pathlib import Path

import duckdb


def main():
    """Write a one-record and a zero-record Avro file, then read both via DuckDB.

    Prints the DuckDB version, the avro extension details (a failure to look
    them up is reported but does not stop the run), and an OK/FAIL line for
    each file. The temporary directory is not removed; its path is printed at
    the end so the generated files can be inspected.

    Raises:
        RuntimeError: if fastavro cannot be imported.
    """
    work_dir = Path(tempfile.mkdtemp(prefix="duckdb_avro_empty_"))
    try:
        empty_path = work_dir / "empty.avro"
        single_path = work_dir / "one.avro"

        record_schema = {
            "type": "record",
            "name": "TestRecord",
            "fields": [{"name": "id", "type": "string"}],
        }
        try:
            import fastavro  # type: ignore
        except Exception as import_err:
            raise RuntimeError("fastavro is required for this repro script") from import_err

        # Non-empty file first, then the schema-only (zero-record) container.
        for target, records in ((single_path, [{"id": "1"}]), (empty_path, [])):
            with open(target, "wb") as sink:
                fastavro.writer(sink, record_schema, records)

        count_sql = "SELECT count(*) FROM read_avro(?)"
        extension_sql = (
            "SELECT extension_version, installed, installed_from, install_path "
            "FROM duckdb_extensions() WHERE extension_name = 'avro'"
        )

        print(f"DuckDB version: {duckdb.__version__}")
        conn = duckdb.connect()
        try:
            # The extension lookup has its own handler so that a failure here
            # is reported separately and the one.avro read still runs.
            try:
                ext_row = conn.execute(extension_sql).fetchone()
                if not ext_row:
                    print("DuckDB avro extension: not listed")
                else:
                    version, is_installed, source, location = ext_row
                    print(
                        f"DuckDB avro extension: version={version}, "
                        f"installed={is_installed}, from={source}, path={location}"
                    )
            except Exception as ext_err:
                print(f"Could not read DuckDB avro extension version: {ext_err}")
            print("Reading one.avro...")
            conn.execute(count_sql, [str(single_path)]).fetchall()
            print("OK: one.avro")
        except Exception as read_err:
            print("FAIL: one.avro:", read_err)

        try:
            print("Reading empty.avro...")
            conn.execute(count_sql, [str(empty_path)]).fetchall()
            print("OK: empty.avro")
        except Exception as read_err:
            print("FAIL: empty.avro:", read_err)
        finally:
            # The connection is closed whether or not the empty-file read worked.
            conn.close()
    finally:
        print(f"Artifacts in: {work_dir}")


# Run the repro only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Output:

DuckDB version: 1.2.2
DuckDB avro extension: version=ed18629, installed=True, from=core, path=/Users/sagi/.duckdb/extensions/v1.2.2/osx_arm64/avro.duckdb_extension
Reading one.avro...
OK: one.avro
Reading empty.avro...
FAIL: empty.avro: Invalid Input Error: Cannot read file block count: Cannot read 1 bytes from memory bufferCannot read 16 bytes from memory bufferCannot read file block count: Cannot read 1 bytes from memory buffer
Artifacts in: /var/folders/gn/pzxmt7f93zx0q1c66vybfd740000gn/T/duckdb_avro_empty_vijp14hz

OS:

OS X 15.7.3

DuckDB Package Version:

1.2.2

Python Version:

3.9.6

Full Name:

Sagi Bashari

Affiliation:

MyHeritage

What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.

I have tested with a stable release

Did you include all relevant data sets for reproducing the issue?

Yes

Did you include all code required to reproduce the issue?

  • Yes, I have

Did you include all relevant configuration to reproduce the issue?

  • Yes, I have

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions