-
Notifications
You must be signed in to change notification settings - Fork 60
Open
Labels
Description
What happens?
The DuckDB Python client fails to read a valid empty Avro container file (schema-only, 0 records), while reading a 1-record Avro file works. The same empty file appears to be readable in the DuckDB CLI, but in Python the read fails with `Invalid Input Error: Cannot read file block count: ...`.
To Reproduce
#!/usr/bin/env python3
"""
Repro: DuckDB Python read_avro fails on empty Avro container.
Requires:
- fastavro
- duckdb python package installed
"""
import tempfile
from pathlib import Path
import duckdb
def _print_avro_extension_info(con: "duckdb.DuckDBPyConnection") -> None:
    """Print the avro extension's version/install metadata, best-effort.

    Any failure while querying or formatting the metadata is reported on
    stdout and swallowed so that the actual repro steps still run.
    """
    try:
        row = con.execute(
            "SELECT extension_version, installed, installed_from, install_path "
            "FROM duckdb_extensions() WHERE extension_name = 'avro'"
        ).fetchone()
        if row:
            ext_version, installed, installed_from, install_path = row
            print(
                "DuckDB avro extension: "
                f"version={ext_version}, installed={installed}, "
                f"from={installed_from}, path={install_path}"
            )
        else:
            print("DuckDB avro extension: not listed")
    except Exception as e:
        # Diagnostics only: never let this abort the repro itself.
        print(f"Could not read DuckDB avro extension version: {e}")


def _try_read_avro(con: "duckdb.DuckDBPyConnection", avro_path: Path) -> None:
    """Run ``count(*)`` over ``read_avro(avro_path)`` and print OK or FAIL.

    The file's base name is used as the label in the printed messages.
    Exceptions are caught and printed so that one failing file does not
    prevent the next file from being attempted.
    """
    label = avro_path.name
    try:
        print(f"Reading {label}...")
        con.execute("SELECT count(*) FROM read_avro(?)", [str(avro_path)]).fetchall()
        print(f"OK: {label}")
    except Exception as e:
        print(f"FAIL: {label}:", e)


def main():
    """Write a 1-record and a 0-record Avro file and read both with DuckDB.

    Both files are written with fastavro into a fresh temporary directory.
    The DuckDB version and avro extension metadata are printed, then each
    file is read through ``read_avro`` and the outcome (OK / FAIL plus the
    error) is printed. The temporary directory is not removed; its path is
    printed at the end so the generated files can be inspected.

    Raises:
        RuntimeError: if fastavro cannot be imported.
    """
    tmpdir = Path(tempfile.mkdtemp(prefix="duckdb_avro_empty_"))
    try:
        empty_avro = tmpdir / "empty.avro"
        one_avro = tmpdir / "one.avro"
        schema = {
            "type": "record",
            "name": "TestRecord",
            "fields": [
                {"name": "id", "type": "string"},
            ],
        }

        # Imported lazily so a missing dependency yields a clear message.
        try:
            import fastavro  # type: ignore
        except ImportError as e:
            raise RuntimeError("fastavro is required for this repro script") from e

        # Non-empty file via fastavro
        with open(one_avro, "wb") as out_f:
            fastavro.writer(out_f, schema, [{"id": "1"}])

        # Empty container via fastavro (no records passed to the writer)
        with open(empty_avro, "wb") as out_f:
            fastavro.writer(out_f, schema, [])

        print(f"DuckDB version: {duckdb.__version__}")

        con = duckdb.connect()
        try:
            _print_avro_extension_info(con)
            _try_read_avro(con, one_avro)
            _try_read_avro(con, empty_avro)
        finally:
            # Single cleanup point: the connection is closed no matter which
            # of the steps above fails.
            con.close()
    finally:
        print(f"Artifacts in: {tmpdir}")


if __name__ == "__main__":
    main()
Output:
DuckDB version: 1.2.2
DuckDB avro extension: version=ed18629, installed=True, from=core, path=/Users/sagi/.duckdb/extensions/v1.2.2/osx_arm64/avro.duckdb_extension
Reading one.avro...
OK: one.avro
Reading empty.avro...
FAIL: empty.avro: Invalid Input Error: Cannot read file block count: Cannot read 1 bytes from memory bufferCannot read 16 bytes from memory bufferCannot read file block count: Cannot read 1 bytes from memory buffer
Artifacts in: /var/folders/gn/pzxmt7f93zx0q1c66vybfd740000gn/T/duckdb_avro_empty_vijp14hz
OS:
OS X 15.7.3
DuckDB Package Version:
1.2.2
Python Version:
3.9.6
Full Name:
Sagi Bashari
Affiliation:
MyHeritage
What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.
I have tested with a stable release
Did you include all relevant data sets for reproducing the issue?
Yes
Did you include all code required to reproduce the issue?
- Yes, I have
Did you include all relevant configuration to reproduce the issue?
- Yes, I have