From b33d2c28127a000266766d41c86c9228536c1326 Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 21 May 2026 22:54:38 +0400 Subject: [PATCH 01/36] ipc ingest --- client/ingest.go | 154 +++++++ client/ingest_test.go | 172 ++++++++ engine.go | 1 + integration-test/ingest/docker-compose.yml | 15 + integration-test/ingest/ingest_test.go | 375 ++++++++++++++++++ integration-test/ingest/run.sh | 29 ++ integration-test/ingest/testdata/init.sql | 13 + .../testdata/schemas/pg_ingest/schema.graphql | 8 + ipc-ingest.go | 351 ++++++++++++++++ pkg/db/pool.go | 12 + 10 files changed, 1130 insertions(+) create mode 100644 client/ingest.go create mode 100644 client/ingest_test.go create mode 100644 integration-test/ingest/docker-compose.yml create mode 100644 integration-test/ingest/ingest_test.go create mode 100755 integration-test/ingest/run.sh create mode 100644 integration-test/ingest/testdata/init.sql create mode 100644 integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql create mode 100644 ipc-ingest.go diff --git a/client/ingest.go b/client/ingest.go new file mode 100644 index 00000000..0bffc18b --- /dev/null +++ b/client/ingest.go @@ -0,0 +1,154 @@ +package client + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" +) + +// IngestResult is the success payload returned by /ipc/ingest. +type IngestResult struct { + DataObject string `json:"data_object"` + Inserted int64 `json:"inserted"` + Columns []string `json:"columns"` +} + +// Ingest streams the records produced by reader into the target data object. +// Columns from the Arrow schema must match insertable fields of the table +// (computed/virtual/reference fields are rejected by the server). +// +// dataObject is either a dotted GraphQL Query path (e.g. "pg_store.public.events") +// or a bare hugr type name. The client serializes the reader as an Apache Arrow +// IPC stream and POSTs it to /ipc/ingest on the configured base URL. +// +// The reader is fully drained on success; on error the caller may inspect the +// reader's remaining state but it should be released by the caller in all cases. +func (c *Client) Ingest(ctx context.Context, dataObject string, reader array.RecordReader) (*IngestResult, error) { + if dataObject == "" { + return nil, errors.New("hugr ingest: data_object is required") + } + if reader == nil { + return nil, errors.New("hugr ingest: reader is nil") + } + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + iw := ipc.NewWriter(pw, ipc.WithSchema(reader.Schema())) + var streamErr error + for reader.Next() { + rec := reader.RecordBatch() + if rec == nil { + continue + } + if err := iw.Write(rec); err != nil { + streamErr = fmt.Errorf("write arrow record: %w", err) + break + } + } + if streamErr == nil { + if err := reader.Err(); err != nil { + streamErr = fmt.Errorf("read arrow record: %w", err) + } + } + if err := iw.Close(); err != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", err) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + endpoint, err := buildIngestURL(c.url, dataObject) + if err != nil { + _ = pr.Close() + return nil, err + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, pr) + if err != nil { + _ = pr.Close() + return nil, err + } + req.Header.Set("Content-Type", "application/vnd.apache.arrow.stream") + setAsUserHeaders(ctx, req) + + resp, err := c.c.Do(req) + if err != nil { + _ = pr.CloseWithError(err) + return nil, err + } + defer resp.Body.Close() + + // Surface a writer-side error in preference to the (likely derivative) + // HTTP error. + if werr := <-writeErr; werr != nil { + return nil, werr + } + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + var ebody struct { + Error string `json:"error"` + } + _ = json.Unmarshal(body, &ebody) + if ebody.Error == "" { + ebody.Error = strings.TrimSpace(string(body)) + } + if ebody.Error == "" { + ebody.Error = resp.Status + } + return nil, fmt.Errorf("hugr ingest: %s: %s", resp.Status, ebody.Error) + } + + var out IngestResult + if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { + return nil, fmt.Errorf("decode ingest response: %w", err) + } + return &out, nil +} + +// IngestRecord is a single-batch convenience wrapper around Ingest. It builds +// an array.RecordReader from a single arrow.RecordBatch and forwards. +func (c *Client) IngestRecord(ctx context.Context, dataObject string, rec arrow.RecordBatch) (*IngestResult, error) { + if rec == nil { + return nil, errors.New("hugr ingest: record is nil") + } + rr, err := array.NewRecordReader(rec.Schema(), []arrow.RecordBatch{rec}) + if err != nil { + return nil, fmt.Errorf("build record reader: %w", err) + } + defer rr.Release() + return c.Ingest(ctx, dataObject, rr) +} + +// buildIngestURL derives the /ipc/ingest endpoint from the client's base /ipc URL. +// Accepts both ".../ipc" (canonical) and ".../ipc/" forms. +func buildIngestURL(base, dataObject string) (string, error) { + u, err := url.Parse(base) + if err != nil { + return "", fmt.Errorf("invalid hugr url %q: %w", base, err) + } + path := strings.TrimSuffix(u.Path, "/") + switch { + case strings.HasSuffix(path, "/ipc"): + u.Path = path + "/ingest" + case strings.HasSuffix(path, "/ipc/ingest"): + // already pointed at ingest endpoint — keep as-is + default: + u.Path = path + "/ipc/ingest" + } + q := u.Query() + q.Set("data_object", dataObject) + u.RawQuery = q.Encode() + return u.String(), nil +} diff --git a/client/ingest_test.go b/client/ingest_test.go new file mode 100644 index 00000000..338ffcda --- /dev/null +++ b/client/ingest_test.go @@ -0,0 +1,172 @@ +package client + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildIngestURL(t *testing.T) { + tests := []struct { + name string + base string + dataObject string + want string + }{ + { + name: "canonical /ipc base", + base: "http://localhost:15000/ipc", + dataObject: "pg_store.public.events", + want: "http://localhost:15000/ipc/ingest?data_object=pg_store.public.events", + }, + { + name: "trailing slash on /ipc", + base: "http://localhost:15000/ipc/", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "base without /ipc", + base: "http://localhost:15000", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "base already at /ipc/ingest", + base: "http://localhost:15000/ipc/ingest", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "data_object with special chars is encoded", + base: "http://localhost:15000/ipc", + dataObject: "schema.table with space", + want: "http://localhost:15000/ipc/ingest?data_object=schema.table+with+space", + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := buildIngestURL(tc.base, tc.dataObject) + require.NoError(t, err) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestBuildIngestURL_BadBase(t *testing.T) { + _, err := buildIngestURL("://not a url", "events") + require.Error(t, err) +} + +// TestIngest_RoundTrip exercises the full client path against an in-memory +// HTTP server: it verifies the URL, headers, that the body is a valid Arrow +// IPC stream, and that the success response is parsed back into IngestResult. +func TestIngest_RoundTrip(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true}, + }, nil) + + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3}, nil) + b.Field(1).(*array.StringBuilder).AppendValues([]string{"a", "b", "c"}, nil) + rec := b.NewRecord() + defer rec.Release() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if r.URL.Path != "/ipc/ingest" { + t.Errorf("expected /ipc/ingest, got %s", r.URL.Path) + } + if got := r.URL.Query().Get("data_object"); got != "ns.mytable" { + t.Errorf("expected data_object=ns.mytable, got %q", got) + } + if ct := r.Header.Get("Content-Type"); !strings.HasPrefix(ct, "application/vnd.apache.arrow.stream") { + t.Errorf("unexpected content-type: %s", ct) + } + + body, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + rr, err := ipc.NewReader(bytes.NewReader(body), ipc.WithAllocator(pool)) + if err != nil { + t.Fatalf("decode body as arrow stream: %v", err) + } + defer rr.Release() + var rows int64 + for rr.Next() { + rows += rr.RecordBatch().NumRows() + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "data_object": r.URL.Query().Get("data_object"), + "inserted": rows, + "columns": []string{"id", "name"}, + }) + })) + t.Cleanup(srv.Close) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestRecord(context.Background(), "ns.mytable", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, "ns.mytable", res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.Equal(t, []string{"id", "name"}, res.Columns) +} + +func TestIngest_ServerError(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "column foo is not defined"}) + })) + t.Cleanup(srv.Close) + + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestRecord(context.Background(), "ns.x", rec) + require.Error(t, err) + assert.True(t, strings.Contains(err.Error(), "column foo is not defined"), + "error should surface server message, got: %v", err) +} + +func TestIngest_NilReader(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.Ingest(context.Background(), "ns.x", nil) + require.Error(t, err) + assert.True(t, errors.Is(err, err)) +} + +func TestIngest_EmptyDataObject(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestRecord(context.Background(), "", nil) + require.Error(t, err) +} diff --git a/engine.go b/engine.go index 4f6540c2..faa9a82f 100644 --- a/engine.go +++ b/engine.go @@ -391,6 +391,7 @@ func (s *Service) endpoints() { s.router.Handle("/query", mw(http.HandlerFunc(s.queryHandler))) s.router.Handle("/jq-query", mw(http.HandlerFunc(s.jqHandler))) s.router.Handle("/ipc", mw(http.HandlerFunc(s.ipcHandler))) + s.router.Handle("/ipc/ingest", mw(http.HandlerFunc(s.ipcIngestHandler))) s.router.Handle("/subscribe", mw(http.HandlerFunc(s.subscribeHandler))) // s.router.Handle("/schema", mw(http.HandlerFunc(s.schemaHandler))) // disabled: schemaHandler blocked on gqlparser requiring *ast.Schema diff --git a/integration-test/ingest/docker-compose.yml b/integration-test/ingest/docker-compose.yml new file mode 100644 index 00000000..7db54aa7 --- /dev/null +++ b/integration-test/ingest/docker-compose.yml @@ -0,0 +1,15 @@ +services: + postgres: + image: postgres:16 + environment: + POSTGRES_DB: ingestdb + POSTGRES_USER: test + POSTGRES_PASSWORD: test + ports: ["5435:5432"] + volumes: + - ./testdata/init.sql:/docker-entrypoint-initdb.d/01-init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U test -d ingestdb"] + interval: 2s + timeout: 5s + retries: 15 diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest/ingest_test.go new file mode 100644 index 00000000..07c9f7dd --- /dev/null +++ b/integration-test/ingest/ingest_test.go @@ -0,0 +1,375 @@ +//go:build duckdb_arrow + +package ingest_test + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + _ "github.com/jackc/pgx/v5/stdlib" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + hugr "github.com/hugr-lab/query-engine" + hugrclient "github.com/hugr-lab/query-engine/client" + "github.com/hugr-lab/query-engine/pkg/auth" + coredb "github.com/hugr-lab/query-engine/pkg/data-sources/sources/runtime/core-db" + "github.com/hugr-lab/query-engine/pkg/db" +) + +const ( + envPostgresDSN = "INGEST_POSTGRES_DSN" + envSchemasPath = "HUGR_INGEST_SCHEMAS_PATH" +) + +// ingestEnv encapsulates a hugr service + an HTTP test server in front of it +// plus a direct sql.DB handle to the underlying postgres for verification. +type ingestEnv struct { + service *hugr.Service + server *httptest.Server + pgConn *sql.DB + client *hugrclient.Client + dsName string +} + +func setupEnv(t *testing.T) *ingestEnv { + t.Helper() + dsn := os.Getenv(envPostgresDSN) + if dsn == "" { + t.Skipf("%s not set — run integration-test/ingest/run.sh to spin up a postgres container", envPostgresDSN) + } + schemasPath := os.Getenv(envSchemasPath) + if schemasPath == "" { + // fall back to repo-relative path + schemasPath = filepath.Join("testdata", "schemas") + } + abs, err := filepath.Abs(schemasPath) + require.NoError(t, err) + require.DirExists(t, filepath.Join(abs, "pg_ingest")) + + ctx := context.Background() + + service, err := hugr.New(hugr.Config{ + Debug: true, + DB: db.Config{}, + CoreDB: coredb.New(coredb.Config{}), + Auth: &auth.Config{ + Providers: []auth.AuthProvider{ + auth.NewAnonymous(auth.AnonymousConfig{ + Allowed: true, + Role: "admin", + }), + }, + }, + }) + require.NoError(t, err) + require.NoError(t, service.Init(ctx)) + + // Register & load the postgres data source pointed at the test database. + mustQuery(t, ctx, service, `mutation($data: core_data_sources_mut_input_data!) { + core { insert_data_sources(data: $data) { name } } + }`, map[string]any{ + "data": map[string]any{ + "name": "pg_ingest", + "type": "postgres", + "prefix": "pg_ingest", + "as_module": true, + "path": dsn, + "catalogs": []map[string]any{{ + "name": "pg_ingest", + "type": "localFS", + "path": filepath.Join(abs, "pg_ingest"), + }}, + }, + }) + mustQuery(t, ctx, service, `mutation { function { core { load_data_source(name: "pg_ingest") { success message } } } }`, nil) + + srv := httptest.NewServer(service) + + pgConn, err := sql.Open("pgx", dsn) + require.NoError(t, err) + require.NoError(t, pgConn.PingContext(ctx)) + + // Truncate before each suite to guarantee determinism. + _, err = pgConn.ExecContext(ctx, "TRUNCATE TABLE events RESTART IDENTITY") + require.NoError(t, err) + + c := hugrclient.NewClient(srv.URL + "/ipc") + + env := &ingestEnv{ + service: service, + server: srv, + pgConn: pgConn, + client: c, + dsName: "pg_ingest", + } + t.Cleanup(func() { + srv.Close() + _ = pgConn.Close() + service.Close() + }) + return env +} + +func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { + t.Helper() + res, err := s.Query(ctx, q, vars) + require.NoError(t, err) + if res.Err() != nil { + require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + } + res.Close() +} + +// makeEventsRecord builds a single Arrow RecordBatch with the columns of the +// pg_ingest.events table (excluding id, which is autogen). +func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + b.Field(1).(*array.Float64Builder).AppendValues(values, nil) + b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := b.Field(3).(*array.StringBuilder) + for _, p := range payload { + if p == "" { + pBuilder.AppendNull() + } else { + pBuilder.Append(p) + } + } + tsBuilder := b.Field(4).(*array.TimestampBuilder) + tsBuilder.AppendValues(created, nil) + return b.NewRecord() +} + +// --- Tests ---------------------------------------------------------------- + +func TestIngest_Postgres_RoundTrip(t *testing.T) { + env := setupEnv(t) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"alpha", "beta", "gamma"}, + []float64{1.5, 2.5, 3.5}, + []bool{true, false, true}, + []string{`{"k":"v"}`, "", `{"x":1}`}, + []arrow.Timestamp{now, now, now}, + ) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, "pg_ingest.events", res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Verify by reading directly from postgres. + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + rows, err := env.pgConn.Query("SELECT name, value, is_active, payload IS NOT NULL FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + var ( + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool + ) + for rows.Next() { + var n string + var v float64 + var a, j bool + require.NoError(t, rows.Scan(&n, &v, &a, &j)) + gotNames = append(gotNames, n) + gotValues = append(gotValues, v) + gotActive = append(gotActive, a) + gotHasJSON = append(gotHasJSON, j) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"alpha", "beta", "gamma"}, gotNames) + assert.Equal(t, []float64{1.5, 2.5, 3.5}, gotValues) + assert.Equal(t, []bool{true, false, true}, gotActive) + assert.Equal(t, []bool{true, false, true}, gotHasJSON) // beta has NULL payload +} + +func TestIngest_Postgres_MultipleBatches(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + mk := func(names []string) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + vals := make([]float64, len(names)) + for i := range vals { + vals[i] = float64(i) + } + b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) + active := make([]bool, len(names)) + for i := range active { + active[i] = true + } + b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) + b.Field(3).(*array.StringBuilder).AppendNulls(len(names)) + ts := make([]arrow.Timestamp, len(names)) + for i := range ts { + ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) + } + b.Field(4).(*array.TimestampBuilder).AppendValues(ts, nil) + return b.NewRecord() + } + rec1 := mk([]string{"a", "b"}) + defer rec1.Release() + rec2 := mk([]string{"c", "d", "e"}) + defer rec2.Release() + + rr, err := array.NewRecordReader(schema, []arrow.RecordBatch{rec1, rec2}) + require.NoError(t, err) + defer rr.Release() + + res, err := env.client.Ingest(context.Background(), "pg_ingest.events", rr) + require.NoError(t, err) + assert.Equal(t, int64(5), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 5, count) +} + +func TestIngest_Postgres_UnknownColumn(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "not_a_column", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues([]string{"x"}, nil) + b.Field(1).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.Error(t, err) + assert.Contains(t, err.Error(), "not_a_column") + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 0, count, "no rows should have been inserted on validation failure") +} + +func TestIngest_Postgres_UnknownDataObject(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "pg_ingest.does_not_exist", rec) + require.Error(t, err) +} + +// TestIngest_HTTP_Direct exercises low-level HTTP behaviour that the typed +// client smoothes over: bad Content-Type, missing data_object, wrong method. +// It writes a small Arrow stream to validate request parsing of /ipc/ingest. +func TestIngest_HTTP_Direct(t *testing.T) { + env := setupEnv(t) + + // Missing data_object. + resp, err := http.Post(env.server.URL+"/ipc/ingest", "application/vnd.apache.arrow.stream", bytes.NewReader(nil)) + require.NoError(t, err) + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Wrong method. + req, _ := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", nil) + resp, err = http.DefaultClient.Do(req) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + + // Wrong content type. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "text/plain", bytes.NewReader([]byte("hello"))) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusUnsupportedMediaType, resp.StatusCode) + + // Body is not a valid Arrow stream. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", bytes.NewReader([]byte("not arrow"))) + require.NoError(t, err) + b, _ = io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Happy-path direct POST returning JSON. + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + bld := array.NewRecordBuilder(pool, schema) + bld.Field(0).(*array.StringBuilder).AppendValues([]string{"direct"}, nil) + bld.Field(1).(*array.Float64Builder).AppendValues([]float64{42}, nil) + bld.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true}, nil) + rec := bld.NewRecord() + bld.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + var out hugrclient.IngestResult + require.NoError(t, json.NewDecoder(resp.Body).Decode(&out)) + assert.Equal(t, int64(1), out.Inserted) +} diff --git a/integration-test/ingest/run.sh b/integration-test/ingest/run.sh new file mode 100755 index 00000000..32f92cca --- /dev/null +++ b/integration-test/ingest/run.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Run the /ipc/ingest integration tests against a fresh Postgres container. +# +# Usage: +# ./run.sh # bring up postgres, run tests, tear down +# ./run.sh keep # leave the container running after tests (for re-runs) +# +# The tests pick up the postgres DSN from INGEST_POSTGRES_DSN; if unset, this +# script populates it with the dockerized instance. + +set -euo pipefail + +HERE="$(cd "$(dirname "$0")" && pwd)" +COMPOSE_FILE="$HERE/docker-compose.yml" + +cleanup() { + if [[ "${1:-}" != "keep" ]]; then + docker compose -f "$COMPOSE_FILE" down -v + fi +} +trap 'cleanup "${1:-}"' EXIT + +docker compose -f "$COMPOSE_FILE" up -d --wait + +export INGEST_POSTGRES_DSN="postgres://test:test@127.0.0.1:5435/ingestdb" +export HUGR_INGEST_SCHEMAS_PATH="$HERE/testdata/schemas" + +cd "$HERE/../.." +go test -tags=duckdb_arrow -count=1 -v ./integration-test/ingest/... diff --git a/integration-test/ingest/testdata/init.sql b/integration-test/ingest/testdata/init.sql new file mode 100644 index 00000000..07362863 --- /dev/null +++ b/integration-test/ingest/testdata/init.sql @@ -0,0 +1,13 @@ +-- Schema used by the /ipc/ingest integration tests. +-- A single events table with a mix of scalar types and an autogen primary key +-- so the tests can also exercise "default value" behaviour (omitting the PK +-- from the Arrow stream). + +CREATE TABLE events ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + value DOUBLE PRECISION NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT true, + payload JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); diff --git a/integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql new file mode 100644 index 00000000..bdd25b85 --- /dev/null +++ b/integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql @@ -0,0 +1,8 @@ +type events @table(name: "events") { + id: BigInt! @pk @default(sequence: "events_id_seq") + name: String! + value: Float! + is_active: Boolean! @default(value: "true") + payload: JSON + created_at: Timestamp @default(value: "now()") +} diff --git a/ipc-ingest.go b/ipc-ingest.go new file mode 100644 index 00000000..68eb9c27 --- /dev/null +++ b/ipc-ingest.go @@ -0,0 +1,351 @@ +package hugr + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/google/uuid" + "github.com/hugr-lab/query-engine/pkg/auth" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/engines" + "github.com/hugr-lab/query-engine/pkg/perm" + "github.com/vektah/gqlparser/v2/ast" +) + +const ( + ingestContentType = "application/vnd.apache.arrow.stream" + ingestDataObjectArg = "data_object" + ingestViewNamePrefix = "_hugr_ingest_" +) + +// IngestResponse is the success payload returned by /ipc/ingest. +type IngestResponse struct { + DataObject string `json:"data_object"` + Inserted int64 `json:"inserted"` + Columns []string `json:"columns"` +} + +type ingestErrorBody struct { + Error string `json:"error"` +} + +// ipcIngestHandler accepts an Apache Arrow IPC stream in the request body and +// inserts the records into the target data object referenced by the +// `data_object` query parameter. +// +// Wire protocol (first iteration): +// - Method: POST +// - URL: /ipc/ingest?data_object= +// - Headers: Content-Type: application/vnd.apache.arrow.stream +// - Body: Arrow IPC stream (schema + record batches) +// - Response: 200 OK, JSON {"data_object": ..., "inserted": N, "columns": [...]} +// +// Restrictions intentionally enforced on this iteration (per design): +// - INSERT only (no on-conflict / merge / upsert / returning) +// - target must be a table data object (views are rejected) +// - reference fields are skipped (not insertable through this path) +// - permissions are checked against the synthetic insert mutation input +func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + if r.Method != http.MethodPost { + writeIngestError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + dataObject := r.URL.Query().Get(ingestDataObjectArg) + if dataObject == "" { + writeIngestError(w, http.StatusBadRequest, "missing data_object query parameter") + return + } + + if ct := r.Header.Get("Content-Type"); ct != "" && !strings.HasPrefix(ct, ingestContentType) { + writeIngestError(w, http.StatusUnsupportedMediaType, + fmt.Sprintf("Content-Type must be %s, got %q", ingestContentType, ct)) + return + } + + ctx := r.Context() + // Auth middleware already populated permissions; make sure we have them + // (handles direct-handler callers that bypass the middleware in tests). + if perm.PermissionsFromCtx(ctx) == nil { + newCtx, err := s.perm.ContextWithPermissions(ctx) + if err != nil { + if errors.Is(err, auth.ErrForbidden) { + writeIngestError(w, http.StatusForbidden, err.Error()) + return + } + writeIngestError(w, http.StatusInternalServerError, err.Error()) + return + } + ctx = newCtx + } + + info, mutationField, err := s.resolveIngestTarget(ctx, dataObject) + if err != nil { + writeIngestError(w, http.StatusBadRequest, err.Error()) + return + } + + eng, err := s.ds.Engine(info.Catalog) + if err != nil { + writeIngestError(w, http.StatusBadRequest, + fmt.Sprintf("engine for catalog %q not available: %v", info.Catalog, err)) + return + } + + reader, err := ipc.NewReader(r.Body, ipc.WithAllocator(memory.NewGoAllocator())) + if err != nil { + writeIngestError(w, http.StatusBadRequest, "invalid arrow stream: "+err.Error()) + return + } + defer reader.Release() + + columns, err := resolveIngestColumns(reader.Schema(), info) + if err != nil { + writeIngestError(w, http.StatusBadRequest, err.Error()) + return + } + if len(columns) == 0 { + writeIngestError(w, http.StatusBadRequest, + "no insertable columns matched between arrow stream and data object") + return + } + + if err := checkIngestPermission(ctx, info, mutationField, columns); err != nil { + if errors.Is(err, auth.ErrForbidden) { + writeIngestError(w, http.StatusForbidden, err.Error()) + return + } + writeIngestError(w, http.StatusInternalServerError, err.Error()) + return + } + + inserted, err := s.executeIngest(ctx, info, eng, reader, columns) + if err != nil { + writeIngestError(w, http.StatusInternalServerError, err.Error()) + return + } + + out := IngestResponse{ + DataObject: dataObject, + Inserted: inserted, + Columns: columnNames(columns), + } + _ = json.NewEncoder(w).Encode(out) +} + +// ingestColumn binds an Arrow input column to an SDL field of the target table. +type ingestColumn struct { + ArrowName string // column name as it appears in the incoming Arrow schema + Field *sdl.Field // resolved SDL field of the target data object +} + +func columnNames(cs []ingestColumn) []string { + out := make([]string, len(cs)) + for i, c := range cs { + out[i] = c.ArrowName + } + return out +} + +// resolveIngestTarget walks the GraphQL schema to find the target data object +// and the corresponding insert mutation field. The dataObject argument can be +// either a dotted Query path (e.g. "pg_store.public.events") or a bare GraphQL +// type name (e.g. "pg_store_public_events"). +func (s *Service) resolveIngestTarget(ctx context.Context, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { + provider := s.schema.Provider() + + var def *ast.Definition + if strings.Contains(dataObject, ".") { + queryDef := provider.ForName(ctx, base.QueryBaseName) + if queryDef == nil { + return nil, nil, fmt.Errorf("query base type not found in schema") + } + cur := queryDef + for _, part := range strings.Split(dataObject, ".") { + f := cur.Fields.ForName(part) + if f == nil { + return nil, nil, fmt.Errorf("data object %q: segment %q not found", dataObject, part) + } + cur = provider.ForName(ctx, f.Type.Name()) + if cur == nil { + return nil, nil, fmt.Errorf("data object %q: type %q not found", dataObject, f.Type.Name()) + } + } + def = cur + } else { + def = provider.ForName(ctx, dataObject) + } + if def == nil { + return nil, nil, fmt.Errorf("data object %q not found in schema", dataObject) + } + if !sdl.IsDataObject(def) { + return nil, nil, fmt.Errorf("%q is not a data object", dataObject) + } + info := sdl.DataObjectInfo(def) + if info == nil { + return nil, nil, fmt.Errorf("data object %q: no info", dataObject) + } + if info.Type != sdl.TableDataObject { + return nil, nil, fmt.Errorf("data object %q is not a table (got %q) — only tables are ingestable", dataObject, info.Type) + } + if info.Catalog == "" { + return nil, nil, fmt.Errorf("data object %q has no catalog", dataObject) + } + + // Find the insert mutation field; we need it for permission checks. + _, mutationField := sdl.ObjectMutationDefinition(ctx, provider, def, sdl.MutationTypeInsert) + if mutationField == nil { + return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + } + return info, mutationField, nil +} + +// resolveIngestColumns maps Arrow schema fields onto SDL fields of the table. +// Reference / virtual / computed fields are intentionally rejected because +// they are not directly insertable. +func resolveIngestColumns(schema *arrow.Schema, info *sdl.Object) ([]ingestColumn, error) { + if schema == nil { + return nil, fmt.Errorf("arrow stream has no schema") + } + cols := make([]ingestColumn, 0, schema.NumFields()) + seen := map[string]struct{}{} + for _, f := range schema.Fields() { + if _, dup := seen[f.Name]; dup { + return nil, fmt.Errorf("duplicate arrow column %q", f.Name) + } + seen[f.Name] = struct{}{} + + fi := info.FieldForName(f.Name) + if fi == nil { + return nil, fmt.Errorf("column %q is not defined in data object %q", + f.Name, info.Definition().Name) + } + if fi.IsReferencesSubquery() { + return nil, fmt.Errorf("column %q is a reference and cannot be ingested directly", + f.Name) + } + if fi.IsNotDBField() { + return nil, fmt.Errorf("column %q is a computed/virtual field and cannot be ingested", + f.Name) + } + if fi.FieldSourceName("", false) == "-" { + return nil, fmt.Errorf("column %q has no database mapping", f.Name) + } + cols = append(cols, ingestColumn{ArrowName: f.Name, Field: fi}) + } + return cols, nil +} + +// checkIngestPermission verifies that the caller may invoke the insert mutation +// and write each of the supplied columns. It mirrors RolePermissions.CheckQuery +// + CheckMutationInput but operates on the synthetic per-column payload that +// an Arrow batch represents. +func checkIngestPermission(ctx context.Context, info *sdl.Object, mutationField *ast.FieldDefinition, cols []ingestColumn) error { + if auth.IsFullAccess(ctx) { + return nil + } + rp := perm.PermissionsFromCtx(ctx) + if rp == nil { + // No permissions configured = allow (matches behaviour of + // CheckQuery callers in the rest of the engine). + return nil + } + if rp.Disabled { + return auth.ErrForbidden + } + + // 1) mutation field itself must be enabled on the parent module type. + parent := "" + if mutationField != nil { + if pd, ok := mutationFieldParent(info); ok { + parent = pd + } + if _, ok := rp.Enabled(parent, mutationField.Name); !ok { + return auth.ErrForbidden + } + } + + // 2) each ingested column must be enabled on the insert input type. + inputName := info.InputInsertDataName() + if inputName == "" { + return nil + } + for _, c := range cols { + if _, ok := rp.Enabled(inputName, c.ArrowName); !ok { + return auth.ErrForbidden + } + } + return nil +} + +// mutationFieldParent returns the name of the GraphQL type that owns the +// insert mutation field for this data object. That type is the field-level +// permission scope (Permission.Object) used by RolePermissions.Enabled. +func mutationFieldParent(info *sdl.Object) (string, bool) { + mod := sdl.ObjectModule(info.Definition()) + return sdl.ModuleTypeName(mod, sdl.ModuleMutation), true +} + +// executeIngest registers the Arrow record reader as a DuckDB view and runs +// `INSERT INTO () SELECT FROM `. The view is bound +// to the underlying DuckDB driver connection and released after the INSERT +// completes (success or failure). +func (s *Service) executeIngest(ctx context.Context, info *sdl.Object, eng engines.Engine, reader *ipc.Reader, cols []ingestColumn) (int64, error) { + ar, err := s.db.Arrow(ctx) + if err != nil { + return 0, fmt.Errorf("acquire duckdb arrow conn: %w", err) + } + defer ar.Close() + + viewName := ingestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", "") + release, err := ar.RegisterView(reader, viewName) + if err != nil { + return 0, fmt.Errorf("register arrow view: %w", err) + } + defer release() + + sqlStr := buildIngestSQL(ctx, info, eng, cols, viewName) + res, err := ar.Exec(ctx, sqlStr) + if err != nil { + return 0, fmt.Errorf("ingest insert failed: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + +// buildIngestSQL constructs the INSERT ... SELECT statement that drains the +// registered Arrow view into the target table. The target is fully qualified +// with the catalog (data-source) identifier so that DuckDB's postgres +// extension can route the INSERT through the attached database. +func buildIngestSQL(ctx context.Context, info *sdl.Object, eng engines.Engine, cols []ingestColumn, viewName string) string { + target := info.SQL(ctx, engines.Ident(info.Catalog)) + + colNames := make([]string, len(cols)) + selectExprs := make([]string, len(cols)) + for i, c := range cols { + colNames[i] = c.Field.FieldSourceName("", true) + selectExprs[i] = engines.Ident(c.ArrowName) + } + + return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s", + target, + strings.Join(colNames, ", "), + strings.Join(selectExprs, ", "), + engines.Ident(viewName), + ) +} + +func writeIngestError(w http.ResponseWriter, status int, msg string) { + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(ingestErrorBody{Error: msg}) +} diff --git a/pkg/db/pool.go b/pkg/db/pool.go index d41c4523..a0d88df0 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -279,6 +279,18 @@ type Arrow struct { release func() } +// Exec runs a statement on the same DuckDB driver connection that backs the +// embedded *duckdb.Arrow. This lets callers RegisterView an Arrow stream and +// then INSERT/UPDATE against the registered view on the same connection — the +// view is per-connection and is not visible to other pool connections. +func (a *Arrow) Exec(ctx context.Context, query string) (driver.Result, error) { + execer, ok := a.drv.(driver.ExecerContext) + if !ok { + return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") + } + return execer.ExecContext(ctx, query, nil) +} + func (a *Arrow) Close() error { defer a.release() return a.drv.Close() From b267858aab130ded9c57ba93d97e5185f67feb15 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 24 May 2026 23:11:03 +0400 Subject: [PATCH 02/36] ipc ingest --- integration-test/ingest/ingest_test.go | 122 ++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest/ingest_test.go index 07c9f7dd..c613314a 100644 --- a/integration-test/ingest/ingest_test.go +++ b/integration-test/ingest/ingest_test.go @@ -7,6 +7,7 @@ import ( "context" "database/sql" "encoding/json" + "fmt" "io" "net/http" "net/http/httptest" @@ -367,9 +368,128 @@ func TestIngest_HTTP_Direct(t *testing.T) { resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", "application/vnd.apache.arrow.stream", &buf) require.NoError(t, err) - defer resp.Body.Close() require.Equal(t, http.StatusOK, resp.StatusCode) var out hugrclient.IngestResult require.NoError(t, json.NewDecoder(resp.Body).Decode(&out)) + resp.Body.Close() assert.Equal(t, int64(1), out.Inserted) + + // --- Real-world bulk path ------------------------------------------------- + // A producer (ETL/CDC/telemetry) streams many RecordBatches in a single + // Arrow IPC stream over one HTTP POST. The whole payload is never + // materialised in memory client-side — we pipe the writer goroutine + // straight into the request body. This is where /ipc/ingest pays off vs. + // GraphQL `insert_events(data: ...)` mutations. + _, err = env.pgConn.ExecContext(context.Background(), "TRUNCATE TABLE events RESTART IDENTITY") + require.NoError(t, err) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + ) + + bulkSchema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(bulkSchema)) + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rb := array.NewRecordBuilder(pool, bulkSchema) + names := rb.Field(0).(*array.StringBuilder) + values := rb.Field(1).(*array.Float64Builder) + active := rb.Field(2).(*array.BooleanBuilder) + payloads := rb.Field(3).(*array.StringBuilder) + ts := rb.Field(4).(*array.TimestampBuilder) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + names.Append(fmt.Sprintf("evt-%06d", row)) + values.Append(float64(row) * 0.5) + active.Append(row%2 == 0) + if row%5 == 0 { + payloads.AppendNull() + } else { + payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + batchRec := rb.NewRecord() + rb.Release() + if werr := w.Write(batchRec); werr != nil { + streamErr = fmt.Errorf("write batch %d: %w", batchIdx, werr) + batchRec.Release() + break + } + batchRec.Release() + } + if cerr := w.Close(); cerr != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", cerr) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, werr, "writer goroutine failed") + require.NoError(t, postErr) + require.Equal(t, http.StatusOK, bulkResp.StatusCode) + var bulkResult hugrclient.IngestResult + require.NoError(t, json.NewDecoder(bulkResp.Body).Decode(&bulkResult)) + bulkResp.Body.Close() + elapsed := time.Since(start) + assert.Equal(t, int64(totalRows), bulkResult.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, bulkResult.Columns) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + // Spot-check a sample to confirm per-row fidelity end-to-end. + rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) + require.NoError(t, err) + defer rows.Close() + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) + for rows.Next() { + var n string + var v float64 + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"evt-000000", "evt-000001", "evt-000002", "evt-000003", "evt-000004"}, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + // row%5 == 0 ⇒ payload IS NULL; in the first five rows that's just row 0. + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + // Cross-check the active-row count to ensure the boolean column survived + // without bit-packing artefacts across batch boundaries. + var activeCount int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } From 4698400d04159df5d6ff3e3d82b8bbc891973c36 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 24 May 2026 23:27:03 +0400 Subject: [PATCH 03/36] ipc ingest --- integration-test/ingest/docker-compose.yml | 4 ++-- integration-test/ingest/run.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration-test/ingest/docker-compose.yml b/integration-test/ingest/docker-compose.yml index 7db54aa7..83ac0a23 100644 --- a/integration-test/ingest/docker-compose.yml +++ b/integration-test/ingest/docker-compose.yml @@ -1,11 +1,11 @@ services: postgres: - image: postgres:16 + image: pgvector/pgvector:pg16 environment: POSTGRES_DB: ingestdb POSTGRES_USER: test POSTGRES_PASSWORD: test - ports: ["5435:5432"] + ports: ["5437:5432"] volumes: - ./testdata/init.sql:/docker-entrypoint-initdb.d/01-init.sql healthcheck: diff --git a/integration-test/ingest/run.sh b/integration-test/ingest/run.sh index 32f92cca..61d0a6d9 100755 --- a/integration-test/ingest/run.sh +++ b/integration-test/ingest/run.sh @@ -22,7 +22,7 @@ trap 'cleanup "${1:-}"' EXIT docker compose -f "$COMPOSE_FILE" up -d --wait -export INGEST_POSTGRES_DSN="postgres://test:test@127.0.0.1:5435/ingestdb" +export INGEST_POSTGRES_DSN="postgres://test:test@127.0.0.1:5437/ingestdb" export HUGR_INGEST_SCHEMAS_PATH="$HERE/testdata/schemas" cd "$HERE/../.." From e57a481afb1838fc1d4630eab090e7fae1165c33 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 25 May 2026 01:15:29 +0400 Subject: [PATCH 04/36] ipc ingest --- client/ingest.go | 292 ++++++++++++--- integration-test/ingest/ingest_test.go | 475 ++++++++++++++++++++++++- 2 files changed, 716 insertions(+), 51 deletions(-) diff --git a/client/ingest.go b/client/ingest.go index 0bffc18b..c836bdca 100644 --- a/client/ingest.go +++ b/client/ingest.go @@ -1,6 +1,7 @@ package client import ( + "bytes" "context" "encoding/json" "errors" @@ -8,11 +9,14 @@ import ( "io" "net/http" "net/url" + "os" "strings" + "sync/atomic" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" ) // IngestResult is the success payload returned by /ipc/ingest. @@ -22,20 +26,78 @@ type IngestResult struct { Columns []string `json:"columns"` } +const ingestContentType = "application/vnd.apache.arrow.stream" + +// arrowFileMagic identifies the Arrow IPC *file* format (random-access), +// distinct from the IPC *stream* format that /ipc/ingest expects on the wire. +var arrowFileMagic = []byte("ARROW1") + +// IngestStream POSTs the given Arrow IPC stream to /ipc/ingest. The body +// must already be a valid Arrow IPC stream (schema message followed by +// record batches) — typically produced by ipc.NewWriter, by another tool, +// or read from a stream-format file. +// +// The body is forwarded to the server without intermediate buffering. Use +// this when the caller already has a serialised stream from disk, the +// network, or another process. Use Ingest for the higher-level API that +// serialises an array.RecordReader for you. +func (c *Client) IngestStream(ctx context.Context, dataObject string, body io.Reader) (*IngestResult, error) { + if dataObject == "" { + return nil, errors.New("hugr ingest: data_object is required") + } + if body == nil { + return nil, errors.New("hugr ingest: body is nil") + } + endpoint, err := buildIngestURL(c.url, dataObject) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, body) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", ingestContentType) + setAsUserHeaders(ctx, req) + resp, err := c.c.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + raw, _ := io.ReadAll(resp.Body) + var ebody struct { + Error string `json:"error"` + } + _ = json.Unmarshal(raw, &ebody) + if ebody.Error == "" { + ebody.Error = strings.TrimSpace(string(raw)) + } + if ebody.Error == "" { + ebody.Error = resp.Status + } + return nil, fmt.Errorf("hugr ingest: %s: %s", resp.Status, ebody.Error) + } + + var out IngestResult + if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { + return nil, fmt.Errorf("decode ingest response: %w", err) + } + return &out, nil +} + // Ingest streams the records produced by reader into the target data object. // Columns from the Arrow schema must match insertable fields of the table // (computed/virtual/reference fields are rejected by the server). // // dataObject is either a dotted GraphQL Query path (e.g. "pg_store.public.events") -// or a bare hugr type name. The client serializes the reader as an Apache Arrow -// IPC stream and POSTs it to /ipc/ingest on the configured base URL. +// or a bare hugr type name. The client serialises the reader as an Apache +// Arrow IPC stream and POSTs it to /ipc/ingest on the configured base URL. // -// The reader is fully drained on success; on error the caller may inspect the -// reader's remaining state but it should be released by the caller in all cases. +// The reader is fully drained on success; on error the caller may inspect +// the reader's remaining state but it should be released by the caller in +// all cases. func (c *Client) Ingest(ctx context.Context, dataObject string, reader array.RecordReader) (*IngestResult, error) { - if dataObject == "" { - return nil, errors.New("hugr ingest: data_object is required") - } if reader == nil { return nil, errors.New("hugr ingest: reader is nil") } @@ -68,53 +130,17 @@ func (c *Client) Ingest(ctx context.Context, dataObject string, reader array.Rec writeErr <- streamErr }() - endpoint, err := buildIngestURL(c.url, dataObject) - if err != nil { - _ = pr.Close() - return nil, err - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, pr) - if err != nil { - _ = pr.Close() - return nil, err - } - req.Header.Set("Content-Type", "application/vnd.apache.arrow.stream") - setAsUserHeaders(ctx, req) - - resp, err := c.c.Do(req) - if err != nil { - _ = pr.CloseWithError(err) - return nil, err + res, httpErr := c.IngestStream(ctx, dataObject, pr) + if httpErr != nil { + // Unblock the writer goroutine if the HTTP side aborted early. + _ = pr.CloseWithError(httpErr) } - defer resp.Body.Close() - - // Surface a writer-side error in preference to the (likely derivative) - // HTTP error. if werr := <-writeErr; werr != nil { + // Serialisation errors are more informative than the (likely + // derivative) HTTP error. return nil, werr } - - if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) - var ebody struct { - Error string `json:"error"` - } - _ = json.Unmarshal(body, &ebody) - if ebody.Error == "" { - ebody.Error = strings.TrimSpace(string(body)) - } - if ebody.Error == "" { - ebody.Error = resp.Status - } - return nil, fmt.Errorf("hugr ingest: %s: %s", resp.Status, ebody.Error) - } - - var out IngestResult - if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { - return nil, fmt.Errorf("decode ingest response: %w", err) - } - return &out, nil + return res, httpErr } // IngestRecord is a single-batch convenience wrapper around Ingest. It builds @@ -131,6 +157,70 @@ func (c *Client) IngestRecord(ctx context.Context, dataObject string, rec arrow. return c.Ingest(ctx, dataObject, rr) } +// IngestArrowIPCFile opens an Arrow IPC file at path and streams its +// contents to /ipc/ingest. Both IPC formats are accepted: +// +// - stream format (no ARROW1 prefix) — written by ipc.NewWriter or +// pyarrow.ipc.new_stream. Bytes are forwarded directly to the server, +// zero-copy. +// - file format (.arrow / .feather, starts with ARROW1 magic) — written +// by ipc.NewFileWriter or pyarrow.feather.write_feather. The file is +// read sequentially via ipc.FileReader and re-emitted as a stream. +func (c *Client) IngestArrowIPCFile(ctx context.Context, dataObject, path string) (*IngestResult, error) { + if path == "" { + return nil, errors.New("hugr ingest: path is required") + } + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open %s: %w", path, err) + } + defer f.Close() + + // Peek for the ARROW1 magic to decide between stream and file format. + var magic [6]byte + n, err := io.ReadFull(f, magic[:]) + if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) { + return nil, fmt.Errorf("read %s: %w", path, err) + } + if n == len(arrowFileMagic) && bytes.Equal(magic[:], arrowFileMagic) { + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, fmt.Errorf("seek %s: %w", path, err) + } + fr, err := ipc.NewFileReader(f, ipc.WithAllocator(memory.NewGoAllocator())) + if err != nil { + return nil, fmt.Errorf("open arrow ipc file %s: %w", path, err) + } + defer fr.Close() + rr := &fileReaderAsRecordReader{fr: fr} + rr.refCount.Add(1) + defer rr.Release() + return c.Ingest(ctx, dataObject, rr) + } + + // Stream format — forward bytes. Prepend the bytes we already consumed + // during magic detection. + body := io.MultiReader(bytes.NewReader(magic[:n]), f) + return c.IngestStream(ctx, dataObject, body) +} + +// NewLazyReader returns an array.RecordReader that produces batches by +// calling gen. gen should return (batch, nil) for each successive batch and +// (nil, nil) to signal end-of-stream. Returning (_, err) terminates the +// reader; the error is then visible via Err(). +// +// The reader takes ownership of each returned batch and releases it on the +// next Next() call or on the final Release. The caller must not Release +// the batch themselves after returning it from gen. +// +// Typical use: stream bulk data from any source (file, channel, generator) +// into Client.Ingest without implementing the full array.RecordReader +// interface by hand. +func NewLazyReader(schema *arrow.Schema, gen func() (arrow.RecordBatch, error)) array.RecordReader { + r := &lazyReader{schema: schema, gen: gen} + r.refCount.Add(1) + return r +} + // buildIngestURL derives the /ipc/ingest endpoint from the client's base /ipc URL. // Accepts both ".../ipc" (canonical) and ".../ipc/" forms. func buildIngestURL(base, dataObject string) (string, error) { @@ -152,3 +242,105 @@ func buildIngestURL(base, dataObject string) (string, error) { u.RawQuery = q.Encode() return u.String(), nil } + +// --- lazyReader ----------------------------------------------------------- + +type lazyReader struct { + schema *arrow.Schema + gen func() (arrow.RecordBatch, error) + + cur arrow.RecordBatch + err error + done bool + refCount atomic.Int64 +} + +func (r *lazyReader) Schema() *arrow.Schema { return r.schema } +func (r *lazyReader) Err() error { return r.err } + +func (r *lazyReader) Next() bool { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + if r.done || r.err != nil { + return false + } + rec, err := r.gen() + if err != nil { + r.err = err + r.done = true + return false + } + if rec == nil { + r.done = true + return false + } + r.cur = rec + return true +} + +func (r *lazyReader) RecordBatch() arrow.RecordBatch { return r.cur } +func (r *lazyReader) Record() arrow.RecordBatch { return r.cur } +func (r *lazyReader) Retain() { r.refCount.Add(1) } +func (r *lazyReader) Release() { + if r.refCount.Add(-1) == 0 { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + } +} + +// --- fileReaderAsRecordReader --------------------------------------------- + +// fileReaderAsRecordReader adapts an *ipc.FileReader (random-access file +// format) to the array.RecordReader interface required by Ingest. +type fileReaderAsRecordReader struct { + fr *ipc.FileReader + cur arrow.RecordBatch + err error + + refCount atomic.Int64 +} + +func (r *fileReaderAsRecordReader) Schema() *arrow.Schema { return r.fr.Schema() } +func (r *fileReaderAsRecordReader) Err() error { return r.err } + +func (r *fileReaderAsRecordReader) Next() bool { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + if r.err != nil { + return false + } + rec, err := r.fr.Read() + if errors.Is(err, io.EOF) { + return false + } + if err != nil { + r.err = err + return false + } + if rec == nil { + return false + } + // FileReader.Read documents that the record is valid until next Read. + // Retain so we own the reference until our own Next/Release. + rec.Retain() + r.cur = rec + return true +} + +func (r *fileReaderAsRecordReader) RecordBatch() arrow.RecordBatch { return r.cur } +func (r *fileReaderAsRecordReader) Record() arrow.RecordBatch { return r.cur } +func (r *fileReaderAsRecordReader) Retain() { r.refCount.Add(1) } +func (r *fileReaderAsRecordReader) Release() { + if r.refCount.Add(-1) == 0 { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + } +} diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest/ingest_test.go index c613314a..801ecd33 100644 --- a/integration-test/ingest/ingest_test.go +++ b/integration-test/ingest/ingest_test.go @@ -7,12 +7,14 @@ import ( "context" "database/sql" "encoding/json" + "errors" "fmt" "io" "net/http" "net/http/httptest" "os" "path/filepath" + "sync/atomic" "testing" "time" @@ -269,6 +271,387 @@ func TestIngest_Postgres_MultipleBatches(t *testing.T) { assert.Equal(t, 5, count) } +// TestIngest_Postgres_Bulk exercises the typed Go client at real-world scale: +// 50 batches × 1000 rows streamed through array.RecordReader, never +// materialised in memory beyond the current batch. Mirrors the wire-level +// bulk path in TestIngest_HTTP_Direct, but goes through hugrclient.Client. +func TestIngest_Postgres_Bulk(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + ) + + reader := newLazyEventsReader( + memory.NewGoAllocator(), + numBatches, rowsPerBatch, + time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC), + ) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), "pg_ingest.events", reader) + elapsed := time.Since(start) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(totalRows), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Time the COUNT(*) immediately after the POST returns. If the server + // were lying / writing asynchronously, this query would either be slow + // (waiting for in-flight writes to land) or return a partial value. + countStart := time.Now() + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + // Spot-check the first five rows for per-column fidelity through the + // client → server → DuckDB → postgres-extension → Postgres pipeline. + rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) + require.NoError(t, err) + defer rows.Close() + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) + for rows.Next() { + var n string + var v float64 + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"evt-000000", "evt-000001", "evt-000002", "evt-000003", "evt-000004"}, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + // row%5 == 0 ⇒ payload IS NULL; in the first five rows that's just row 0. + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + var activeCount int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("bulk ingest via Go client: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_Postgres_Stream covers Client.IngestStream — the low-level API +// that takes a raw Arrow IPC stream as io.Reader. We serialise a buffer +// ourselves and verify it lands in Postgres. +func TestIngest_Postgres_Stream(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + b.Field(0).(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + b.Field(1).(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecord() + b.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + res, err := env.client.IngestStream(context.Background(), "pg_ingest.events", &buf) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 2, count) +} + +// TestIngest_Postgres_Stream_Empty checks that IngestStream rejects nil body +// without sending anything to the server. +func TestIngest_Postgres_Stream_Empty(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestStream(context.Background(), "pg_ingest.events", nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") + + _, err = env.client.IngestStream(context.Background(), "", bytes.NewReader([]byte{})) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +// TestIngest_Postgres_ArrowIPCFile_StreamFormat writes an Arrow IPC stream +// file to disk and ingests it via IngestArrowIPCFile. The file has no +// ARROW1 magic, so the client should byte-forward it to /ipc/ingest. +func TestIngest_Postgres_ArrowIPCFile_StreamFormat(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + mk := func(names []string, vals []float64) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) + act := make([]bool, len(names)) + for i := range act { + act[i] = true + } + b.Field(2).(*array.BooleanBuilder).AppendValues(act, nil) + return b.NewRecord() + } + rec1 := mk([]string{"f1", "f2"}, []float64{1, 2}) + defer rec1.Release() + rec2 := mk([]string{"f3"}, []float64{3}) + defer rec2.Release() + + dir := t.TempDir() + path := filepath.Join(dir, "events_stream.arrows") + f, err := os.Create(path) + require.NoError(t, err) + w := ipc.NewWriter(f, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec1)) + require.NoError(t, w.Write(rec2)) + require.NoError(t, w.Close()) + require.NoError(t, f.Close()) + + res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(3), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + // Verify both batches landed (ordered by name). + rows, err := env.pgConn.Query("SELECT name, value FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + var gotNames []string + var gotVals []float64 + for rows.Next() { + var n string + var v float64 + require.NoError(t, rows.Scan(&n, &v)) + gotNames = append(gotNames, n) + gotVals = append(gotVals, v) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"f1", "f2", "f3"}, gotNames) + assert.Equal(t, []float64{1, 2, 3}, gotVals) +} + +// TestIngest_Postgres_ArrowIPCFile_FileFormat writes an Arrow IPC *file* +// format (.arrow with ARROW1 magic) to disk and ingests it via +// IngestArrowIPCFile. The client should detect the magic, use FileReader, +// and re-emit as a stream to the server. +func TestIngest_Postgres_ArrowIPCFile_FileFormat(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + b.Field(0).(*array.StringBuilder).AppendValues([]string{"file1", "file2", "file3"}, nil) + b.Field(1).(*array.Float64Builder).AppendValues([]float64{100, 200, 300}, nil) + b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, true, false}, nil) + rec := b.NewRecord() + b.Release() + defer rec.Release() + + dir := t.TempDir() + path := filepath.Join(dir, "events_file.arrow") + f, err := os.Create(path) + require.NoError(t, err) + fw, err := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, err) + require.NoError(t, fw.Write(rec)) + require.NoError(t, fw.Close()) + require.NoError(t, f.Close()) + + // Sanity-check that we actually wrote the file format (ARROW1 prefix). + prefix, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(prefix), 6) + assert.Equal(t, "ARROW1", string(prefix[:6]), "test setup must produce file format with ARROW1 magic") + + res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(3), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + rows, err := env.pgConn.Query("SELECT name, value FROM events ORDER BY value") + require.NoError(t, err) + defer rows.Close() + var gotNames []string + for rows.Next() { + var n string + var v float64 + require.NoError(t, rows.Scan(&n, &v)) + gotNames = append(gotNames, n) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"file1", "file2", "file3"}, gotNames) +} + +// TestIngest_Postgres_ArrowIPCFile_NotFound checks that a missing file +// surfaces a clean error without touching the server. +func TestIngest_Postgres_ArrowIPCFile_NotFound(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", + filepath.Join(t.TempDir(), "does-not-exist.arrows")) + require.Error(t, err) +} + +// TestIngest_Postgres_LazyReader exercises NewLazyReader at scale: 50×1000 +// rows generated on demand by a closure, no boilerplate RecordReader +// implementation. Mirrors TestIngest_Postgres_Bulk to prove the helper is +// equivalent. +func TestIngest_Postgres_LazyReader(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + ) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + batchIdx := 0 + reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if batchIdx >= numBatches { + return nil, nil + } + rb := array.NewRecordBuilder(pool, schema) + defer rb.Release() + names := rb.Field(0).(*array.StringBuilder) + values := rb.Field(1).(*array.Float64Builder) + active := rb.Field(2).(*array.BooleanBuilder) + payloads := rb.Field(3).(*array.StringBuilder) + ts := rb.Field(4).(*array.TimestampBuilder) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + names.Append(fmt.Sprintf("lz-%06d", row)) + values.Append(float64(row) * 0.5) + active.Append(row%2 == 0) + if row%5 == 0 { + payloads.AppendNull() + } else { + payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + rec := rb.NewRecord() + batchIdx++ + return rec, nil + }) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), "pg_ingest.events", reader) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + t.Logf("lazy-reader bulk ingest: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_LazyReader_Termination is a unit-style test for NewLazyReader's +// termination semantics (no server / postgres needed): (nil, nil) ends the +// stream; (_, err) surfaces via Err(). +func TestIngest_LazyReader_Termination(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + mk := func(v int32) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(v) + return b.NewRecord() + } + + // Case 1: gen returns batches then nil — clean end-of-stream. + { + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i >= 3 { + return nil, nil + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + require.NoError(t, r.Err()) + assert.Equal(t, 3, seen) + assert.False(t, r.Next(), "Next after end-of-stream stays false") + } + + // Case 2: gen returns an error — surfaces via Err, terminates stream. + { + errBoom := errors.New("boom") + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i == 2 { + return nil, errBoom + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + assert.Equal(t, 2, seen, "should yield batches before the failing call") + require.Error(t, r.Err()) + assert.ErrorIs(t, r.Err(), errBoom) + } +} + func TestIngest_Postgres_UnknownColumn(t *testing.T) { env := setupEnv(t) @@ -453,9 +836,15 @@ func TestIngest_HTTP_Direct(t *testing.T) { assert.Equal(t, int64(totalRows), bulkResult.Inserted) assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, bulkResult.Columns) + // Time the COUNT(*) right after the POST returns to prove the writes are + // synchronous: if the server reported "inserted" before the data was + // actually committed to Postgres, COUNT(*) would either lag or be partial. + countStart := time.Now() var count int require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) - assert.Equal(t, totalRows, count) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) // Spot-check a sample to confirm per-row fidelity end-to-end. rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) @@ -493,3 +882,87 @@ func TestIngest_HTTP_Direct(t *testing.T) { t.Logf("bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } + +// lazyEventsReader is an array.RecordReader that generates events-table +// RecordBatches on demand. This is the shape of a real-world Arrow producer +// (parquet scanner, CDC tap, kafka batcher) — the whole stream is never +// materialised in memory beyond the batch currently being consumed. +type lazyEventsReader struct { + pool memory.Allocator + schema *arrow.Schema + numBatches int + rowsPerBatch int + base time.Time + + batchIdx int + current arrow.RecordBatch + err error + refCount atomic.Int64 +} + +func newLazyEventsReader(pool memory.Allocator, numBatches, rowsPerBatch int, base time.Time) *lazyEventsReader { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + r := &lazyEventsReader{ + pool: pool, + schema: schema, + numBatches: numBatches, + rowsPerBatch: rowsPerBatch, + base: base, + } + r.refCount.Add(1) + return r +} + +func (r *lazyEventsReader) Schema() *arrow.Schema { return r.schema } +func (r *lazyEventsReader) Err() error { return r.err } + +func (r *lazyEventsReader) Next() bool { + if r.current != nil { + r.current.Release() + r.current = nil + } + if r.batchIdx >= r.numBatches { + return false + } + rb := array.NewRecordBuilder(r.pool, r.schema) + defer rb.Release() + names := rb.Field(0).(*array.StringBuilder) + values := rb.Field(1).(*array.Float64Builder) + active := rb.Field(2).(*array.BooleanBuilder) + payloads := rb.Field(3).(*array.StringBuilder) + ts := rb.Field(4).(*array.TimestampBuilder) + for i := 0; i < r.rowsPerBatch; i++ { + row := r.batchIdx*r.rowsPerBatch + i + names.Append(fmt.Sprintf("evt-%06d", row)) + values.Append(float64(row) * 0.5) + active.Append(row%2 == 0) + if row%5 == 0 { + payloads.AppendNull() + } else { + payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + ts.Append(arrow.Timestamp(r.base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + r.current = rb.NewRecord() + r.batchIdx++ + return true +} + +func (r *lazyEventsReader) RecordBatch() arrow.RecordBatch { return r.current } +func (r *lazyEventsReader) Record() arrow.RecordBatch { return r.current } + +func (r *lazyEventsReader) Retain() { r.refCount.Add(1) } +func (r *lazyEventsReader) Release() { + if r.refCount.Add(-1) == 0 { + if r.current != nil { + r.current.Release() + r.current = nil + } + } +} From 61f4d2b49d341f61ffa6b80e683b86141d96f8fe Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 25 May 2026 01:29:35 +0400 Subject: [PATCH 05/36] ipc ingest --- integration-test/ingest/ingest_test.go | 246 +++++++++++++++++-------- 1 file changed, 170 insertions(+), 76 deletions(-) diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest/ingest_test.go index 801ecd33..a3970c46 100644 --- a/integration-test/ingest/ingest_test.go +++ b/integration-test/ingest/ingest_test.go @@ -393,130 +393,224 @@ func TestIngest_Postgres_Stream_Empty(t *testing.T) { assert.Contains(t, err.Error(), "data_object") } -// TestIngest_Postgres_ArrowIPCFile_StreamFormat writes an Arrow IPC stream -// file to disk and ingests it via IngestArrowIPCFile. The file has no -// ARROW1 magic, so the client should byte-forward it to /ipc/ingest. -func TestIngest_Postgres_ArrowIPCFile_StreamFormat(t *testing.T) { - env := setupEnv(t) +// arrowFileFormat picks between Arrow IPC stream (no magic) and Arrow IPC +// file (ARROW1 prefix) for the writeEventsArrowFile helper. +type arrowFileFormat int - pool := memory.NewGoAllocator() - schema := arrow.NewSchema([]arrow.Field{ +const ( + arrowStreamFormat arrowFileFormat = iota + arrowFileFmt +) + +func eventsArrowSchema() *arrow.Schema { + return arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, }, nil) - mk := func(names []string, vals []float64) arrow.RecordBatch { - b := array.NewRecordBuilder(pool, schema) - defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) - b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) - act := make([]bool, len(names)) - for i := range act { - act[i] = true - } - b.Field(2).(*array.BooleanBuilder).AppendValues(act, nil) - return b.NewRecord() - } - rec1 := mk([]string{"f1", "f2"}, []float64{1, 2}) - defer rec1.Release() - rec2 := mk([]string{"f3"}, []float64{3}) - defer rec2.Release() +} + +// writeEventsArrowFile produces an Arrow IPC file at path in the given +// format with numBatches × rowsPerBatch synthetic events rows. namePrefix is +// embedded in the `name` column so different tests can write to the same +// table without colliding on uniqueness assertions. +func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { + t.Helper() + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() - dir := t.TempDir() - path := filepath.Join(dir, "events_stream.arrows") f, err := os.Create(path) require.NoError(t, err) - w := ipc.NewWriter(f, ipc.WithSchema(schema)) - require.NoError(t, w.Write(rec1)) - require.NoError(t, w.Write(rec2)) + defer f.Close() + + type writer interface { + Write(arrow.RecordBatch) error + Close() error + } + var w writer + switch format { + case arrowStreamFormat: + w = ipc.NewWriter(f, ipc.WithSchema(schema)) + case arrowFileFmt: + fw, ferr := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, ferr) + w = fw + default: + t.Fatalf("unknown arrow file format: %d", format) + } + + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rb := array.NewRecordBuilder(pool, schema) + names := rb.Field(0).(*array.StringBuilder) + values := rb.Field(1).(*array.Float64Builder) + active := rb.Field(2).(*array.BooleanBuilder) + payloads := rb.Field(3).(*array.StringBuilder) + ts := rb.Field(4).(*array.TimestampBuilder) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) + values.Append(float64(row) * 0.5) + active.Append(row%2 == 0) + if row%5 == 0 { + payloads.AppendNull() + } else { + payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + rec := rb.NewRecord() + rb.Release() + require.NoError(t, w.Write(rec)) + rec.Release() + } require.NoError(t, w.Close()) - require.NoError(t, f.Close()) +} +// TestIngest_Postgres_ArrowIPCFile_StreamFormat builds a 50×1000-row Arrow +// IPC *stream* file on disk and ingests it via IngestArrowIPCFile. The +// client should detect "no ARROW1 magic" and byte-forward the file body +// straight into /ipc/ingest — the bulk path with zero re-serialisation. +func TestIngest_Postgres_ArrowIPCFile_StreamFormat(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "stream" + ) + + path := filepath.Join(t.TempDir(), "events_stream.arrows") + writeEventsArrowFile(t, path, namePrefix, arrowStreamFormat, numBatches, rowsPerBatch) + + // Sanity-check that the file is actually stream format (no ARROW1). + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.NotEqual(t, "ARROW1", string(head[:6]), "test setup must produce stream format (no ARROW1 magic)") + + start := time.Now() res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path) + elapsed := time.Since(start) require.NoError(t, err) require.NotNil(t, res) - assert.Equal(t, int64(3), res.Inserted) + assert.Equal(t, int64(totalRows), res.Inserted) + // Synchronicity check: COUNT(*) must see all rows the moment POST returns. + countStart := time.Now() var count int require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) - assert.Equal(t, 3, count) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible immediately") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) - // Verify both batches landed (ordered by name). - rows, err := env.pgConn.Query("SELECT name, value FROM events ORDER BY name") + // Spot-check the first 5 rows by content (rows produced by namePrefix-N). + rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) require.NoError(t, err) defer rows.Close() - var gotNames []string - var gotVals []float64 + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) for rows.Next() { var n string var v float64 - require.NoError(t, rows.Scan(&n, &v)) - gotNames = append(gotNames, n) - gotVals = append(gotVals, v) + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) } require.NoError(t, rows.Err()) - assert.Equal(t, []string{"f1", "f2", "f3"}, gotNames) - assert.Equal(t, []float64{1, 2, 3}, gotVals) + assert.Equal(t, []string{namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", namePrefix + "-000003", namePrefix + "-000004"}, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + // Active-row count guards against bit-packing artefacts across batches. + var activeCount int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } -// TestIngest_Postgres_ArrowIPCFile_FileFormat writes an Arrow IPC *file* -// format (.arrow with ARROW1 magic) to disk and ingests it via -// IngestArrowIPCFile. The client should detect the magic, use FileReader, -// and re-emit as a stream to the server. +// TestIngest_Postgres_ArrowIPCFile_FileFormat builds a 50×1000-row Arrow IPC +// *file* format file (ARROW1 magic + random-access footer) on disk and +// ingests it via IngestArrowIPCFile. The client should detect the magic, +// open the file with ipc.FileReader, and re-emit as a stream to the server. func TestIngest_Postgres_ArrowIPCFile_FileFormat(t *testing.T) { env := setupEnv(t) - pool := memory.NewGoAllocator() - schema := arrow.NewSchema([]arrow.Field{ - {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, - {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, - {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, - }, nil) - b := array.NewRecordBuilder(pool, schema) - b.Field(0).(*array.StringBuilder).AppendValues([]string{"file1", "file2", "file3"}, nil) - b.Field(1).(*array.Float64Builder).AppendValues([]float64{100, 200, 300}, nil) - b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, true, false}, nil) - rec := b.NewRecord() - b.Release() - defer rec.Release() + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "file" + ) - dir := t.TempDir() - path := filepath.Join(dir, "events_file.arrow") - f, err := os.Create(path) - require.NoError(t, err) - fw, err := ipc.NewFileWriter(f, ipc.WithSchema(schema)) - require.NoError(t, err) - require.NoError(t, fw.Write(rec)) - require.NoError(t, fw.Close()) - require.NoError(t, f.Close()) + path := filepath.Join(t.TempDir(), "events_file.arrow") + writeEventsArrowFile(t, path, namePrefix, arrowFileFmt, numBatches, rowsPerBatch) // Sanity-check that we actually wrote the file format (ARROW1 prefix). - prefix, err := os.ReadFile(path) + head, err := os.ReadFile(path) require.NoError(t, err) - require.GreaterOrEqual(t, len(prefix), 6) - assert.Equal(t, "ARROW1", string(prefix[:6]), "test setup must produce file format with ARROW1 magic") + require.GreaterOrEqual(t, len(head), 6) + assert.Equal(t, "ARROW1", string(head[:6]), "test setup must produce file format with ARROW1 magic") + start := time.Now() res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path) + elapsed := time.Since(start) require.NoError(t, err) require.NotNil(t, res) - assert.Equal(t, int64(3), res.Inserted) + assert.Equal(t, int64(totalRows), res.Inserted) + // Synchronicity check. + countStart := time.Now() var count int require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) - assert.Equal(t, 3, count) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible immediately") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) - rows, err := env.pgConn.Query("SELECT name, value FROM events ORDER BY value") + rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) require.NoError(t, err) defer rows.Close() - var gotNames []string + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) for rows.Next() { var n string var v float64 - require.NoError(t, rows.Scan(&n, &v)) - gotNames = append(gotNames, n) + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) } require.NoError(t, rows.Err()) - assert.Equal(t, []string{"file1", "file2", "file3"}, gotNames) + assert.Equal(t, []string{namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", namePrefix + "-000003", namePrefix + "-000004"}, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + var activeCount int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } // TestIngest_Postgres_ArrowIPCFile_NotFound checks that a missing file From b7764df443f71e69b336c28a57f66d593df05137 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 31 May 2026 13:02:27 +0400 Subject: [PATCH 06/36] ipc ingest --- client/ingest_test.go | 332 +++++++ .../ingest-duckdb/ingest_duckdb_test.go | 857 ++++++++++++++++++ .../schemas/duck_ingest/schema.graphql | 8 + 3 files changed, 1197 insertions(+) create mode 100644 integration-test/ingest-duckdb/ingest_duckdb_test.go create mode 100644 integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql diff --git a/client/ingest_test.go b/client/ingest_test.go index 338ffcda..5559ad84 100644 --- a/client/ingest_test.go +++ b/client/ingest_test.go @@ -8,7 +8,10 @@ import ( "io" "net/http" "net/http/httptest" + "os" + "path/filepath" "strings" + "sync/atomic" "testing" "github.com/apache/arrow-go/v18/arrow" @@ -19,6 +22,64 @@ import ( "github.com/stretchr/testify/require" ) +// --- shared helpers ------------------------------------------------------- + +// ingestOKHandler is a server that decodes the incoming Arrow IPC stream, +// counts rows, and answers with a canonical IngestResult. The decoded +// schema's column names are returned in the response so tests can assert +// per-column fidelity. +func ingestOKHandler(t *testing.T, pool memory.Allocator) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if r.URL.Path != "/ipc/ingest" { + t.Errorf("expected /ipc/ingest, got %s", r.URL.Path) + } + if ct := r.Header.Get("Content-Type"); !strings.HasPrefix(ct, "application/vnd.apache.arrow.stream") { + t.Errorf("unexpected content-type: %s", ct) + } + body, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + rr, err := ipc.NewReader(bytes.NewReader(body), ipc.WithAllocator(pool)) + if err != nil { + t.Fatalf("decode body as arrow stream: %v", err) + } + defer rr.Release() + var rows int64 + var cols []string + for _, f := range rr.Schema().Fields() { + cols = append(cols, f.Name) + } + for rr.Next() { + rows += rr.RecordBatch().NumRows() + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "data_object": r.URL.Query().Get("data_object"), + "inserted": rows, + "columns": cols, + }) + } +} + +// smallRecord builds a single 2-row record with an int32 + string column. +func smallRecord(t *testing.T, pool memory.Allocator) arrow.RecordBatch { + t.Helper() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{10, 20}, nil) + b.Field(1).(*array.StringBuilder).AppendValues([]string{"alpha", "beta"}, nil) + return b.NewRecord() +} + func TestBuildIngestURL(t *testing.T) { tests := []struct { name string @@ -170,3 +231,274 @@ func TestIngest_EmptyDataObject(t *testing.T) { _, err := c.IngestRecord(context.Background(), "", nil) require.Error(t, err) } + +func TestIngest_ServerErrorTextBody(t *testing.T) { + // 4xx with a non-JSON body — error message must still be surfaced. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.WriteHeader(http.StatusUnsupportedMediaType) + _, _ = w.Write([]byte("Content-Type must be application/vnd.apache.arrow.stream")) + })) + t.Cleanup(srv.Close) + + pool := memory.NewGoAllocator() + rec := smallRecord(t, pool) + defer rec.Release() + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestRecord(context.Background(), "ns.x", rec) + require.Error(t, err) + assert.Contains(t, err.Error(), "Content-Type must be") +} + +func TestIngest_WriterErrorWinsOverHTTP(t *testing.T) { + // reader.Err() returns a non-nil error AFTER yielding one good batch. + // The HTTP side will see EOF / truncated stream and may respond 4xx; + // the client must surface the writer-side error, not the HTTP one. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Drain body to unblock the writer, then respond with a generic 500 + // so we can confirm the client prefers writer error over this. + _, _ = io.Copy(io.Discard, r.Body) + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("server failed")) + })) + t.Cleanup(srv.Close) + + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + pool := memory.NewGoAllocator() + errBoom := errors.New("reader source explosion") + calls := 0 + reader := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + calls++ + if calls == 1 { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(1) + return b.NewRecord(), nil + } + return nil, errBoom + }) + defer reader.Release() + + c := NewClient(srv.URL + "/ipc") + _, err := c.Ingest(context.Background(), "ns.x", reader) + require.Error(t, err) + assert.Contains(t, err.Error(), "reader source explosion", + "writer-side error must be surfaced, got: %v", err) +} + +// --- IngestStream --------------------------------------------------------- + +func TestIngestStream_Happy(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + rec := smallRecord(t, pool) + defer rec.Release() + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(rec.Schema())) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestStream(context.Background(), "ns.t", &buf) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) + assert.Equal(t, "ns.t", res.DataObject) + assert.ElementsMatch(t, []string{"id", "name"}, res.Columns) +} + +func TestIngestStream_NilBody(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestStream(context.Background(), "ns.t", nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") +} + +func TestIngestStream_EmptyDataObject(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestStream(context.Background(), "", bytes.NewReader(nil)) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +func TestIngestStream_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "invalid arrow stream"}) + })) + t.Cleanup(srv.Close) + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestStream(context.Background(), "ns.t", bytes.NewReader([]byte("not arrow"))) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid arrow stream") +} + +// --- IngestArrowIPCFile --------------------------------------------------- + +func writeArrowStreamFile(t *testing.T, dir string, pool memory.Allocator) (string, *arrow.Schema) { + t.Helper() + rec := smallRecord(t, pool) + defer rec.Release() + path := filepath.Join(dir, "data.arrows") + f, err := os.Create(path) + require.NoError(t, err) + w := ipc.NewWriter(f, ipc.WithSchema(rec.Schema())) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + require.NoError(t, f.Close()) + return path, rec.Schema() +} + +func writeArrowIPCFile(t *testing.T, dir string, pool memory.Allocator) (string, *arrow.Schema) { + t.Helper() + rec := smallRecord(t, pool) + defer rec.Release() + path := filepath.Join(dir, "data.arrow") + f, err := os.Create(path) + require.NoError(t, err) + fw, err := ipc.NewFileWriter(f, ipc.WithSchema(rec.Schema())) + require.NoError(t, err) + require.NoError(t, fw.Write(rec)) + require.NoError(t, fw.Close()) + require.NoError(t, f.Close()) + return path, rec.Schema() +} + +func TestIngestArrowIPCFile_StreamFormat(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + path, _ := writeArrowStreamFile(t, t.TempDir(), pool) + // Sanity-check: file is *stream* format (no ARROW1 magic). + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.NotEqual(t, "ARROW1", string(head[:6])) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestArrowIPCFile(context.Background(), "ns.t", path) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) +} + +func TestIngestArrowIPCFile_FileFormat(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + path, _ := writeArrowIPCFile(t, t.TempDir(), pool) + // Sanity-check: file is *file* format (ARROW1 magic). + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.Equal(t, "ARROW1", string(head[:6])) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestArrowIPCFile(context.Background(), "ns.t", path) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) +} + +func TestIngestArrowIPCFile_NotFound(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestArrowIPCFile(context.Background(), "ns.t", + filepath.Join(t.TempDir(), "does-not-exist.arrows")) + require.Error(t, err) +} + +func TestIngestArrowIPCFile_EmptyPath(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestArrowIPCFile(context.Background(), "ns.t", "") + require.Error(t, err) + assert.Contains(t, err.Error(), "path is required") +} + +// --- NewLazyReader -------------------------------------------------------- + +func TestNewLazyReader_CompletesOnNilNil(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + i := 0 + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i >= 3 { + return nil, nil // signal end-of-stream + } + i++ + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(int32(i)) + return b.NewRecord(), nil + }) + defer r.Release() + + assert.Equal(t, schema, r.Schema()) + seen := 0 + for r.Next() { + require.NotNil(t, r.RecordBatch()) + assert.Equal(t, int64(1), r.RecordBatch().NumRows()) + seen++ + } + require.NoError(t, r.Err()) + assert.Equal(t, 3, seen) + assert.False(t, r.Next(), "Next stays false after end-of-stream") +} + +func TestNewLazyReader_PropagatesError(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + errBoom := errors.New("source failure") + i := 0 + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i == 2 { + return nil, errBoom + } + i++ + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(int32(i)) + return b.NewRecord(), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + assert.Equal(t, 2, seen, "should yield batches before the failing call") + require.Error(t, r.Err()) + assert.ErrorIs(t, r.Err(), errBoom) + assert.False(t, r.Next(), "Next stays false after error") +} + +func TestNewLazyReader_RetainReleaseRefcount(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { return nil, nil }) + // initial refCount = 1 (set by constructor) + rc := refCountOf(t, r) + assert.Equal(t, int64(1), rc.Load()) + r.Retain() + assert.Equal(t, int64(2), rc.Load()) + r.Release() + assert.Equal(t, int64(1), rc.Load()) + r.Release() + assert.Equal(t, int64(0), rc.Load()) +} + +// refCountOf reaches into the concrete *lazyReader to verify retain/release +// semantics. Test-only — the field is unexported on purpose. +func refCountOf(t *testing.T, r array.RecordReader) *atomic.Int64 { + t.Helper() + lr, ok := r.(*lazyReader) + require.True(t, ok, "expected *lazyReader, got %T", r) + return &lr.refCount +} diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go new file mode 100644 index 00000000..9528bbe7 --- /dev/null +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -0,0 +1,857 @@ +//go:build duckdb_arrow + +package ingest_duckdb_test + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "sync/atomic" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + _ "github.com/duckdb/duckdb-go/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + hugr "github.com/hugr-lab/query-engine" + hugrclient "github.com/hugr-lab/query-engine/client" + "github.com/hugr-lab/query-engine/pkg/auth" + coredb "github.com/hugr-lab/query-engine/pkg/data-sources/sources/runtime/core-db" + "github.com/hugr-lab/query-engine/pkg/db" +) + +// ingestEnv mirrors the Postgres counterpart but everything runs in-process +// against a local .duckdb file in t.TempDir(): +// - the .duckdb file is created and seeded (CREATE SEQUENCE + CREATE TABLE) +// via a direct sql.Open("duckdb", path) connection; +// - that connection is then CLOSED — only one process can hold the write +// lock, and we hand the file over to hugr next; +// - hugr is started in-process, registers the file as a "duckdb" data +// source named "duck_ingest" and ATTACHes it; +// - the verifier sql.DB is re-opened in READ_ONLY mode (DuckDB allows +// concurrent read-only connections while another writer holds the file), +// giving the test an independent view of the data — analogous to the +// pgx-based pgConn in the Postgres test suite. +type ingestEnv struct { + service *hugr.Service + server *httptest.Server + client *hugrclient.Client + dbPath string +} + +// openRO returns a fresh READ_ONLY sql.DB handle to the events database. +// DuckDB RO connections opened in the same process as a writer DO NOT +// transparently refresh snapshot across pooled connections, so we open a +// fresh handle per verification — this gives us a guaranteed post-write +// snapshot at the moment of the assertion. Callers should `defer Close()`. +func (e *ingestEnv) openRO(t *testing.T) *sql.DB { + t.Helper() + conn, err := sql.Open("duckdb", e.dbPath+"?access_mode=read_only") + require.NoError(t, err) + require.NoError(t, conn.PingContext(context.Background())) + return conn +} + +func setupEnv(t *testing.T) *ingestEnv { + t.Helper() + ctx := context.Background() + + dbPath := filepath.Join(t.TempDir(), "test.duckdb") + + // 1. Seed schema with a private writer; close before hugr opens it. + seed, err := sql.Open("duckdb", dbPath) + require.NoError(t, err) + _, err = seed.ExecContext(ctx, ` + CREATE SEQUENCE events_id_seq; + CREATE TABLE events ( + id BIGINT PRIMARY KEY DEFAULT nextval('events_id_seq'), + name VARCHAR NOT NULL, + value DOUBLE NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT true, + payload JSON, + created_at TIMESTAMPTZ NOT NULL DEFAULT now() + ); + `) + require.NoError(t, err) + require.NoError(t, seed.Close()) + + // 2. Schema path for the localFS catalog. + schemaDir, err := filepath.Abs(filepath.Join("testdata", "schemas", "duck_ingest")) + require.NoError(t, err) + require.DirExists(t, schemaDir) + + // 3. Start hugr in-process. + service, err := hugr.New(hugr.Config{ + Debug: true, + DB: db.Config{}, + CoreDB: coredb.New(coredb.Config{}), + Auth: &auth.Config{ + Providers: []auth.AuthProvider{ + auth.NewAnonymous(auth.AnonymousConfig{ + Allowed: true, + Role: "admin", + }), + }, + }, + }) + require.NoError(t, err) + require.NoError(t, service.Init(ctx)) + + // 4. Register & load the duckdb data source pointed at the file. + mustQuery(t, ctx, service, `mutation($data: core_data_sources_mut_input_data!) { + core { insert_data_sources(data: $data) { name } } + }`, map[string]any{ + "data": map[string]any{ + "name": "duck_ingest", + "type": "duckdb", + "prefix": "duck_ingest", + "as_module": true, + "path": dbPath, + "catalogs": []map[string]any{{ + "name": "duck_ingest", + "type": "localFS", + "path": schemaDir, + }}, + }, + }) + mustQuery(t, ctx, service, `mutation { function { core { load_data_source(name: "duck_ingest") { success message } } } }`, nil) + + srv := httptest.NewServer(service) + + c := hugrclient.NewClient(srv.URL + "/ipc") + + env := &ingestEnv{ + service: service, + server: srv, + client: c, + dbPath: dbPath, + } + t.Cleanup(func() { + srv.Close() + service.Close() + }) + return env +} + +func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { + t.Helper() + res, err := s.Query(ctx, q, vars) + require.NoError(t, err) + if res.Err() != nil { + require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + } + res.Close() +} + +func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + b.Field(1).(*array.Float64Builder).AppendValues(values, nil) + b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := b.Field(3).(*array.StringBuilder) + for _, p := range payload { + if p == "" { + pBuilder.AppendNull() + } else { + pBuilder.Append(p) + } + } + tsBuilder := b.Field(4).(*array.TimestampBuilder) + tsBuilder.AppendValues(created, nil) + return b.NewRecord() +} + +// --- Core tests ----------------------------------------------------------- + +func TestIngest_DuckDB_RoundTrip(t *testing.T) { + env := setupEnv(t) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"alpha", "beta", "gamma"}, + []float64{1.5, 2.5, 3.5}, + []bool{true, false, true}, + []string{`{"k":"v"}`, "", `{"x":1}`}, + []arrow.Timestamp{now, now, now}, + ) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "duck_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, "duck_ingest.events", res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Verify via a fresh READ_ONLY verifier connection (independent of hugr's + // session). Open a new handle per verification to guarantee a post-write + // snapshot — see openRO doc. + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + ro2 := env.openRO(t) + defer ro2.Close() + rows, err := ro2.Query("SELECT name, value, is_active, payload IS NOT NULL FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + var ( + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool + ) + for rows.Next() { + var n string + var v float64 + var a, j bool + require.NoError(t, rows.Scan(&n, &v, &a, &j)) + gotNames = append(gotNames, n) + gotValues = append(gotValues, v) + gotActive = append(gotActive, a) + gotHasJSON = append(gotHasJSON, j) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"alpha", "beta", "gamma"}, gotNames) + assert.Equal(t, []float64{1.5, 2.5, 3.5}, gotValues) + assert.Equal(t, []bool{true, false, true}, gotActive) + assert.Equal(t, []bool{true, false, true}, gotHasJSON) +} + +func TestIngest_DuckDB_UnknownColumn(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "not_a_column", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues([]string{"x"}, nil) + b.Field(1).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "duck_ingest.events", rec) + require.Error(t, err) + assert.Contains(t, err.Error(), "not_a_column") + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 0, count, "no rows should have been inserted on validation failure") +} + +func TestIngest_DuckDB_UnknownDataObject(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "duck_ingest.does_not_exist", rec) + require.Error(t, err) +} + +func TestIngest_DuckDB_MultipleBatches(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + mk := func(names []string) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + vals := make([]float64, len(names)) + for i := range vals { + vals[i] = float64(i) + } + b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) + active := make([]bool, len(names)) + for i := range active { + active[i] = true + } + b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) + b.Field(3).(*array.StringBuilder).AppendNulls(len(names)) + ts := make([]arrow.Timestamp, len(names)) + for i := range ts { + ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) + } + b.Field(4).(*array.TimestampBuilder).AppendValues(ts, nil) + return b.NewRecord() + } + rec1 := mk([]string{"a", "b"}) + defer rec1.Release() + rec2 := mk([]string{"c", "d", "e"}) + defer rec2.Release() + + rr, err := array.NewRecordReader(schema, []arrow.RecordBatch{rec1, rec2}) + require.NoError(t, err) + defer rr.Release() + + res, err := env.client.Ingest(context.Background(), "duck_ingest.events", rr) + require.NoError(t, err) + assert.Equal(t, int64(5), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 5, count) +} + +// TestIngest_DuckDB_Bulk — 50k rows via the typed Go client + NewLazyReader +// (lazy generation, never materialised), with post-POST COUNT(*) timing +// check against a fresh READ_ONLY verifier. +func TestIngest_DuckDB_Bulk(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-bulk" + ) + + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + batchIdx := 0 + reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if batchIdx >= numBatches { + return nil, nil + } + rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + batchIdx++ + return rec, nil + }) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), "duck_ingest.events", reader) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + // post-POST COUNT(*) through a fresh READ_ONLY connection — synchronicity. + ro := env.openRO(t) + defer ro.Close() + countStart := time.Now() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + // Spot-check first 5 rows by content. + ro2 := env.openRO(t) + defer ro2.Close() + rows, err := ro2.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) + require.NoError(t, err) + defer rows.Close() + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) + for rows.Next() { + var n string + var v float64 + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{ + namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", + namePrefix + "-000003", namePrefix + "-000004", + }, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + // row%5 == 0 ⇒ payload IS NULL; only row 0 in the first five. + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + ro3 := env.openRO(t) + defer ro3.Close() + var activeCount int + require.NoError(t, ro3.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("bulk ingest via Go client: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_DuckDB_Stream — IngestStream happy path with a pre-serialised +// Arrow buffer. +func TestIngest_DuckDB_Stream(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + b.Field(0).(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + b.Field(1).(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecord() + b.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + res, err := env.client.IngestStream(context.Background(), "duck_ingest.events", &buf) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 2, count) +} + +func TestIngest_DuckDB_Stream_Empty(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestStream(context.Background(), "duck_ingest.events", nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") + + _, err = env.client.IngestStream(context.Background(), "", bytes.NewReader([]byte{})) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +// TestIngest_DuckDB_ArrowIPCFile_StreamFormat — 50k×1000 stream-format file +// → IngestArrowIPCFile → byte-forwarded to /ipc/ingest. +func TestIngest_DuckDB_ArrowIPCFile_StreamFormat(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-fs" + ) + path := filepath.Join(t.TempDir(), "events_stream.arrows") + writeEventsArrowFile(t, path, namePrefix, arrowStreamFormat, numBatches, rowsPerBatch) + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.NotEqual(t, "ARROW1", string(head[:6]), "stream format must not start with ARROW1 magic") + + start := time.Now() + res, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", path) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_DuckDB_ArrowIPCFile_FileFormat — 50k×1000 file-format (.arrow, +// ARROW1 magic + footer) → IngestArrowIPCFile detects magic, re-streams via +// ipc.FileReader. +func TestIngest_DuckDB_ArrowIPCFile_FileFormat(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-ff" + ) + path := filepath.Join(t.TempDir(), "events_file.arrow") + writeEventsArrowFile(t, path, namePrefix, arrowFileFmt, numBatches, rowsPerBatch) + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.Equal(t, "ARROW1", string(head[:6]), "file format must start with ARROW1 magic") + + start := time.Now() + res, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", path) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +func TestIngest_DuckDB_ArrowIPCFile_NotFound(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", + filepath.Join(t.TempDir(), "does-not-exist.arrows")) + require.Error(t, err) +} + +// TestIngest_DuckDB_LazyReader — alias: bulk ingest via NewLazyReader, but +// keeping symmetry with PG suite name. Same scenario as Bulk above but with +// a distinct prefix so the suite can run all tests against a single setup +// without collisions if combined. +func TestIngest_DuckDB_LazyReader(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-lz" + ) + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + batchIdx := 0 + reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if batchIdx >= numBatches { + return nil, nil + } + rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + batchIdx++ + return rec, nil + }) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), "duck_ingest.events", reader) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + t.Logf("lazy-reader bulk ingest: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_LazyReader_Termination_DuckDB — engine-agnostic unit-style test +// for NewLazyReader's termination semantics. Doesn't need the server, but +// mirrors the PG suite for full symmetry. +func TestIngest_LazyReader_Termination_DuckDB(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + mk := func(v int32) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(v) + return b.NewRecord() + } + + // gen returns batches then nil → clean end-of-stream. + { + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i >= 3 { + return nil, nil + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + require.NoError(t, r.Err()) + assert.Equal(t, 3, seen) + assert.False(t, r.Next(), "Next after end-of-stream stays false") + } + + // gen returns an error → Err() exposes it, stream terminates. + { + errBoom := errors.New("boom") + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i == 2 { + return nil, errBoom + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + assert.Equal(t, 2, seen, "yielded batches before failing call") + require.Error(t, r.Err()) + assert.ErrorIs(t, r.Err(), errBoom) + } +} + +// TestIngest_HTTP_Direct_DuckDB exercises low-level HTTP behaviour against +// /ipc/ingest (bad Content-Type, missing data_object, wrong method, invalid +// body) plus a real-world bulk path streamed through io.Pipe straight into +// the request body. Mirrors TestIngest_HTTP_Direct from the PG suite. +func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { + env := setupEnv(t) + + // Missing data_object. + resp, err := http.Post(env.server.URL+"/ipc/ingest", "application/vnd.apache.arrow.stream", bytes.NewReader(nil)) + require.NoError(t, err) + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Wrong method. + req, _ := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", nil) + resp, err = http.DefaultClient.Do(req) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + + // Wrong content type. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + "text/plain", bytes.NewReader([]byte("hello"))) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusUnsupportedMediaType, resp.StatusCode) + + // Body is not a valid Arrow stream. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + "application/vnd.apache.arrow.stream", bytes.NewReader([]byte("not arrow"))) + require.NoError(t, err) + b, _ = io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Happy path — single small record. + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + bld := array.NewRecordBuilder(pool, schema) + bld.Field(0).(*array.StringBuilder).AppendValues([]string{"direct"}, nil) + bld.Field(1).(*array.Float64Builder).AppendValues([]float64{42}, nil) + bld.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true}, nil) + rec := bld.NewRecord() + bld.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + var out hugrclient.IngestResult + require.NoError(t, json.NewDecoder(resp.Body).Decode(&out)) + resp.Body.Close() + assert.Equal(t, int64(1), out.Inserted) + + // --- Real-world bulk via io.Pipe streamed into the request body. + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-direct" + ) + bulkSchema := eventsArrowSchema() + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(bulkSchema)) + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + batchRec := buildEventsBatch(pool, bulkSchema, batchIdx, rowsPerBatch, namePrefix, base) + if werr := w.Write(batchRec); werr != nil { + streamErr = fmt.Errorf("write batch %d: %w", batchIdx, werr) + batchRec.Release() + break + } + batchRec.Release() + } + if cerr := w.Close(); cerr != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", cerr) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, werr, "writer goroutine failed") + require.NoError(t, postErr) + require.Equal(t, http.StatusOK, bulkResp.StatusCode) + var bulkResult hugrclient.IngestResult + require.NoError(t, json.NewDecoder(bulkResp.Body).Decode(&bulkResult)) + bulkResp.Body.Close() + elapsed := time.Since(start) + assert.Equal(t, int64(totalRows), bulkResult.Inserted) + + ro := env.openRO(t) + defer ro.Close() + countStart := time.Now() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-direct-%'").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all dk-direct rows visible immediately after POST") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + t.Logf("bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// --- helpers -------------------------------------------------------------- + +type arrowFileFormat int + +const ( + arrowStreamFormat arrowFileFormat = iota + arrowFileFmt +) + +func eventsArrowSchema() *arrow.Schema { + return arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) +} + +// buildEventsBatch produces one RecordBatch of `rowsPerBatch` rows for the +// events schema. Row payload pattern matches the PG bulk fixtures so the +// spot-check assertions are reusable. +func buildEventsBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string, base time.Time) arrow.RecordBatch { + rb := array.NewRecordBuilder(pool, schema) + defer rb.Release() + names := rb.Field(0).(*array.StringBuilder) + values := rb.Field(1).(*array.Float64Builder) + active := rb.Field(2).(*array.BooleanBuilder) + payloads := rb.Field(3).(*array.StringBuilder) + ts := rb.Field(4).(*array.TimestampBuilder) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) + values.Append(float64(row) * 0.5) + active.Append(row%2 == 0) + if row%5 == 0 { + payloads.AppendNull() + } else { + payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + return rb.NewRecord() +} + +// writeEventsArrowFile writes an Arrow IPC file (stream or file format) at +// path with `numBatches * rowsPerBatch` rows for the events schema. +func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { + t.Helper() + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() + + f, err := os.Create(path) + require.NoError(t, err) + defer f.Close() + + type writer interface { + Write(arrow.RecordBatch) error + Close() error + } + var w writer + switch format { + case arrowStreamFormat: + w = ipc.NewWriter(f, ipc.WithSchema(schema)) + case arrowFileFmt: + fw, ferr := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, ferr) + w = fw + default: + t.Fatalf("unknown arrow file format: %d", format) + } + + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + require.NoError(t, w.Write(rec)) + rec.Release() + } + require.NoError(t, w.Close()) +} + +// Silence "imported and not used" if a refactor leaves a quoted ref around. +var _ atomic.Int64 diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql new file mode 100644 index 00000000..bdd25b85 --- /dev/null +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -0,0 +1,8 @@ +type events @table(name: "events") { + id: BigInt! @pk @default(sequence: "events_id_seq") + name: String! + value: Float! + is_active: Boolean! @default(value: "true") + payload: JSON + created_at: Timestamp @default(value: "now()") +} From c041af27b960a0c1f3764f1593f70f4e3f6f4a77 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 31 May 2026 13:20:17 +0400 Subject: [PATCH 07/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 166 +++++++++++------- integration-test/ingest/ingest_test.go | 114 ++++++++---- 2 files changed, 176 insertions(+), 104 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index 9528bbe7..d631f453 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "log" "net/http" "net/http/httptest" "os" @@ -33,23 +34,60 @@ import ( "github.com/hugr-lab/query-engine/pkg/db" ) -// ingestEnv mirrors the Postgres counterpart but everything runs in-process -// against a local .duckdb file in t.TempDir(): -// - the .duckdb file is created and seeded (CREATE SEQUENCE + CREATE TABLE) -// via a direct sql.Open("duckdb", path) connection; -// - that connection is then CLOSED — only one process can hold the write -// lock, and we hand the file over to hugr next; -// - hugr is started in-process, registers the file as a "duckdb" data -// source named "duck_ingest" and ATTACHes it; -// - the verifier sql.DB is re-opened in READ_ONLY mode (DuckDB allows -// concurrent read-only connections while another writer holds the file), -// giving the test an independent view of the data — analogous to the -// pgx-based pgConn in the Postgres test suite. +// ingestEnv is per-test state on top of a shared hugr.Service (initialised +// once in TestMain). Each test owns a unique .duckdb file and a unique data +// source name, so tests don't share table state. Cleanup unloads the source +// to DETACH the file before t.TempDir() removes it. type ingestEnv struct { - service *hugr.Service - server *httptest.Server - client *hugrclient.Client - dbPath string + service *hugr.Service + server *httptest.Server + client *hugrclient.Client + dbPath string + dsName string // unique data source / catalog prefix, e.g. "duck_ingest_3" + dataObject string // dsName + ".events" +} + +// Shared service initialised once for the whole package — see TestMain. +// hugr.New + service.Init costs ~17s; doing it once cuts the package +// wall-clock from 13×17s ≈ 3.5min down to one-off ~17s + ~ms/test. +var ( + sharedService *hugr.Service + sharedServer *httptest.Server + sharedClient *hugrclient.Client + dsCounter atomic.Int64 +) + +func TestMain(m *testing.M) { + ctx := context.Background() + + service, err := hugr.New(hugr.Config{ + Debug: false, // shared service runs many tests — keep logs quiet + DB: db.Config{}, + CoreDB: coredb.New(coredb.Config{}), + Auth: &auth.Config{ + Providers: []auth.AuthProvider{ + auth.NewAnonymous(auth.AnonymousConfig{ + Allowed: true, + Role: "admin", + }), + }, + }, + }) + if err != nil { + log.Fatalf("hugr.New: %v", err) + } + if err := service.Init(ctx); err != nil { + log.Fatalf("service.Init: %v", err) + } + sharedService = service + sharedServer = httptest.NewServer(service) + sharedClient = hugrclient.NewClient(sharedServer.URL + "/ipc") + + code := m.Run() + + sharedServer.Close() + _ = service.Close() + os.Exit(code) } // openRO returns a fresh READ_ONLY sql.DB handle to the events database. @@ -69,7 +107,9 @@ func setupEnv(t *testing.T) *ingestEnv { t.Helper() ctx := context.Background() - dbPath := filepath.Join(t.TempDir(), "test.duckdb") + n := dsCounter.Add(1) + dsName := fmt.Sprintf("duck_ingest_%d", n) + dbPath := filepath.Join(t.TempDir(), fmt.Sprintf("test_%d.duckdb", n)) // 1. Seed schema with a private writer; close before hugr opens it. seed, err := sql.Open("duckdb", dbPath) @@ -93,56 +133,48 @@ func setupEnv(t *testing.T) *ingestEnv { require.NoError(t, err) require.DirExists(t, schemaDir) - // 3. Start hugr in-process. - service, err := hugr.New(hugr.Config{ - Debug: true, - DB: db.Config{}, - CoreDB: coredb.New(coredb.Config{}), - Auth: &auth.Config{ - Providers: []auth.AuthProvider{ - auth.NewAnonymous(auth.AnonymousConfig{ - Allowed: true, - Role: "admin", - }), - }, - }, - }) - require.NoError(t, err) - require.NoError(t, service.Init(ctx)) - - // 4. Register & load the duckdb data source pointed at the file. - mustQuery(t, ctx, service, `mutation($data: core_data_sources_mut_input_data!) { + // 3. Register & load this test's unique data source on the SHARED service. + mustQuery(t, ctx, sharedService, `mutation($data: core_data_sources_mut_input_data!) { core { insert_data_sources(data: $data) { name } } }`, map[string]any{ "data": map[string]any{ - "name": "duck_ingest", + "name": dsName, "type": "duckdb", - "prefix": "duck_ingest", + "prefix": dsName, "as_module": true, "path": dbPath, "catalogs": []map[string]any{{ - "name": "duck_ingest", + "name": dsName, "type": "localFS", "path": schemaDir, }}, }, }) - mustQuery(t, ctx, service, `mutation { function { core { load_data_source(name: "duck_ingest") { success message } } } }`, nil) - - srv := httptest.NewServer(service) - - c := hugrclient.NewClient(srv.URL + "/ipc") + mustQuery(t, ctx, sharedService, `mutation($name: String!) { + function { core { load_data_source(name: $name) { success message } } } + }`, map[string]any{"name": dsName}) env := &ingestEnv{ - service: service, - server: srv, - client: c, - dbPath: dbPath, + service: sharedService, + server: sharedServer, + client: sharedClient, + dbPath: dbPath, + dsName: dsName, + dataObject: dsName + ".events", } + + // Unload on test completion so DETACH releases the .duckdb file before + // t.TempDir() removes it. Best-effort: ignore errors (next test uses a + // different name + file, so a leak is harmless within a single run). t.Cleanup(func() { - srv.Close() - service.Close() + res, err := sharedService.Query(ctx, `mutation($name: String!, $hard: Boolean) { + function { core { unload_data_source(name: $name, hard: $hard) { success message } } } + }`, map[string]any{"name": dsName, "hard": true}) + if err == nil { + res.Close() + } }) + return env } @@ -199,10 +231,10 @@ func TestIngest_DuckDB_RoundTrip(t *testing.T) { ) defer rec.Release() - res, err := env.client.IngestRecord(context.Background(), "duck_ingest.events", rec) + res, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) require.NoError(t, err) require.NotNil(t, res) - assert.Equal(t, "duck_ingest.events", res.DataObject) + assert.Equal(t, env.dataObject, res.DataObject) assert.Equal(t, int64(3), res.Inserted) assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) @@ -258,7 +290,7 @@ func TestIngest_DuckDB_UnknownColumn(t *testing.T) { rec := b.NewRecord() defer rec.Release() - _, err := env.client.IngestRecord(context.Background(), "duck_ingest.events", rec) + _, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) require.Error(t, err) assert.Contains(t, err.Error(), "not_a_column") @@ -282,7 +314,7 @@ func TestIngest_DuckDB_UnknownDataObject(t *testing.T) { rec := b.NewRecord() defer rec.Release() - _, err := env.client.IngestRecord(context.Background(), "duck_ingest.does_not_exist", rec) + _, err := env.client.IngestRecord(context.Background(), env.dsName+".does_not_exist", rec) require.Error(t, err) } @@ -328,7 +360,7 @@ func TestIngest_DuckDB_MultipleBatches(t *testing.T) { require.NoError(t, err) defer rr.Release() - res, err := env.client.Ingest(context.Background(), "duck_ingest.events", rr) + res, err := env.client.Ingest(context.Background(), env.dataObject, rr) require.NoError(t, err) assert.Equal(t, int64(5), res.Inserted) @@ -368,7 +400,7 @@ func TestIngest_DuckDB_Bulk(t *testing.T) { defer reader.Release() start := time.Now() - res, err := env.client.Ingest(context.Background(), "duck_ingest.events", reader) + res, err := env.client.Ingest(context.Background(), env.dataObject, reader) elapsed := time.Since(start) require.NoError(t, err) assert.Equal(t, int64(totalRows), res.Inserted) @@ -449,7 +481,7 @@ func TestIngest_DuckDB_Stream(t *testing.T) { require.NoError(t, w.Write(rec)) require.NoError(t, w.Close()) - res, err := env.client.IngestStream(context.Background(), "duck_ingest.events", &buf) + res, err := env.client.IngestStream(context.Background(), env.dataObject, &buf) require.NoError(t, err) require.NotNil(t, res) assert.Equal(t, int64(2), res.Inserted) @@ -463,7 +495,7 @@ func TestIngest_DuckDB_Stream(t *testing.T) { func TestIngest_DuckDB_Stream_Empty(t *testing.T) { env := setupEnv(t) - _, err := env.client.IngestStream(context.Background(), "duck_ingest.events", nil) + _, err := env.client.IngestStream(context.Background(), env.dataObject, nil) require.Error(t, err) assert.Contains(t, err.Error(), "body is nil") @@ -491,7 +523,7 @@ func TestIngest_DuckDB_ArrowIPCFile_StreamFormat(t *testing.T) { assert.NotEqual(t, "ARROW1", string(head[:6]), "stream format must not start with ARROW1 magic") start := time.Now() - res, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", path) + res, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, path) elapsed := time.Since(start) require.NoError(t, err) assert.Equal(t, int64(totalRows), res.Inserted) @@ -526,7 +558,7 @@ func TestIngest_DuckDB_ArrowIPCFile_FileFormat(t *testing.T) { assert.Equal(t, "ARROW1", string(head[:6]), "file format must start with ARROW1 magic") start := time.Now() - res, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", path) + res, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, path) elapsed := time.Since(start) require.NoError(t, err) assert.Equal(t, int64(totalRows), res.Inserted) @@ -543,7 +575,7 @@ func TestIngest_DuckDB_ArrowIPCFile_FileFormat(t *testing.T) { func TestIngest_DuckDB_ArrowIPCFile_NotFound(t *testing.T) { env := setupEnv(t) - _, err := env.client.IngestArrowIPCFile(context.Background(), "duck_ingest.events", + _, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, filepath.Join(t.TempDir(), "does-not-exist.arrows")) require.Error(t, err) } @@ -577,7 +609,7 @@ func TestIngest_DuckDB_LazyReader(t *testing.T) { defer reader.Release() start := time.Now() - res, err := env.client.Ingest(context.Background(), "duck_ingest.events", reader) + res, err := env.client.Ingest(context.Background(), env.dataObject, reader) elapsed := time.Since(start) require.NoError(t, err) assert.Equal(t, int64(totalRows), res.Inserted) @@ -664,21 +696,21 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) // Wrong method. - req, _ := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", nil) + req, _ := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object="+env.dataObject, nil) resp, err = http.DefaultClient.Do(req) require.NoError(t, err) resp.Body.Close() assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) // Wrong content type. - resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, "text/plain", bytes.NewReader([]byte("hello"))) require.NoError(t, err) resp.Body.Close() assert.Equal(t, http.StatusUnsupportedMediaType, resp.StatusCode) // Body is not a valid Arrow stream. - resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, "application/vnd.apache.arrow.stream", bytes.NewReader([]byte("not arrow"))) require.NoError(t, err) b, _ = io.ReadAll(resp.Body) @@ -705,7 +737,7 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { require.NoError(t, w.Write(rec)) require.NoError(t, w.Close()) - resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, "application/vnd.apache.arrow.stream", &buf) require.NoError(t, err) require.Equal(t, http.StatusOK, resp.StatusCode) @@ -747,7 +779,7 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { }() start := time.Now() - bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=duck_ingest.events", + bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, "application/vnd.apache.arrow.stream", pr) werr := <-writeErr require.NoError(t, werr, "writer goroutine failed") diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest/ingest_test.go index a3970c46..56c1a964 100644 --- a/integration-test/ingest/ingest_test.go +++ b/integration-test/ingest/ingest_test.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "log" "net/http" "net/http/httptest" "os" @@ -38,8 +39,9 @@ const ( envSchemasPath = "HUGR_INGEST_SCHEMAS_PATH" ) -// ingestEnv encapsulates a hugr service + an HTTP test server in front of it -// plus a direct sql.DB handle to the underlying postgres for verification. +// ingestEnv is per-test view on top of a shared hugr.Service (initialised +// once in TestMain). hugr.New + service.Init costs ~17s; doing it once cuts +// the package wall-clock from N×17s down to a one-off ~17s + ~ms/test. type ingestEnv struct { service *hugr.Service server *httptest.Server @@ -48,25 +50,39 @@ type ingestEnv struct { dsName string } -func setupEnv(t *testing.T) *ingestEnv { - t.Helper() +// Shared state — set up in TestMain when the postgres DSN env var is present. +// Tests Skip when sharedService is nil (DSN not configured). +var ( + sharedService *hugr.Service + sharedServer *httptest.Server + sharedPgConn *sql.DB + sharedClient *hugrclient.Client +) + +func TestMain(m *testing.M) { dsn := os.Getenv(envPostgresDSN) if dsn == "" { - t.Skipf("%s not set — run integration-test/ingest/run.sh to spin up a postgres container", envPostgresDSN) + // No DSN configured — let tests Skip individually with a friendly + // message. Don't fail the package. + os.Exit(m.Run()) } + schemasPath := os.Getenv(envSchemasPath) if schemasPath == "" { - // fall back to repo-relative path schemasPath = filepath.Join("testdata", "schemas") } abs, err := filepath.Abs(schemasPath) - require.NoError(t, err) - require.DirExists(t, filepath.Join(abs, "pg_ingest")) + if err != nil { + log.Fatalf("resolve schemas path: %v", err) + } + if _, err := os.Stat(filepath.Join(abs, "pg_ingest")); err != nil { + log.Fatalf("schemas/pg_ingest dir not found at %s: %v", abs, err) + } ctx := context.Background() service, err := hugr.New(hugr.Config{ - Debug: true, + Debug: false, // shared service runs many tests — keep logs quiet DB: db.Config{}, CoreDB: coredb.New(coredb.Config{}), Auth: &auth.Config{ @@ -78,11 +94,15 @@ func setupEnv(t *testing.T) *ingestEnv { }, }, }) - require.NoError(t, err) - require.NoError(t, service.Init(ctx)) + if err != nil { + log.Fatalf("hugr.New: %v", err) + } + if err := service.Init(ctx); err != nil { + log.Fatalf("service.Init: %v", err) + } // Register & load the postgres data source pointed at the test database. - mustQuery(t, ctx, service, `mutation($data: core_data_sources_mut_input_data!) { + regRes, err := service.Query(ctx, `mutation($data: core_data_sources_mut_input_data!) { core { insert_data_sources(data: $data) { name } } }`, map[string]any{ "data": map[string]any{ @@ -98,43 +118,63 @@ func setupEnv(t *testing.T) *ingestEnv { }}, }, }) - mustQuery(t, ctx, service, `mutation { function { core { load_data_source(name: "pg_ingest") { success message } } } }`, nil) + if err != nil { + log.Fatalf("register pg_ingest: %v", err) + } + if regRes.Err() != nil { + log.Fatalf("register pg_ingest graphql error: %v", regRes.Err()) + } + regRes.Close() + + loadRes, err := service.Query(ctx, `mutation { function { core { load_data_source(name: "pg_ingest") { success message } } } }`, nil) + if err != nil { + log.Fatalf("load pg_ingest: %v", err) + } + if loadRes.Err() != nil { + log.Fatalf("load pg_ingest graphql error: %v", loadRes.Err()) + } + loadRes.Close() srv := httptest.NewServer(service) pgConn, err := sql.Open("pgx", dsn) - require.NoError(t, err) - require.NoError(t, pgConn.PingContext(ctx)) + if err != nil { + log.Fatalf("open pg verifier conn: %v", err) + } + if err := pgConn.PingContext(ctx); err != nil { + log.Fatalf("ping pg verifier conn: %v", err) + } - // Truncate before each suite to guarantee determinism. - _, err = pgConn.ExecContext(ctx, "TRUNCATE TABLE events RESTART IDENTITY") - require.NoError(t, err) + sharedService = service + sharedServer = srv + sharedPgConn = pgConn + sharedClient = hugrclient.NewClient(srv.URL + "/ipc") - c := hugrclient.NewClient(srv.URL + "/ipc") + code := m.Run() - env := &ingestEnv{ - service: service, - server: srv, - pgConn: pgConn, - client: c, - dsName: "pg_ingest", - } - t.Cleanup(func() { - srv.Close() - _ = pgConn.Close() - service.Close() - }) - return env + _ = pgConn.Close() + srv.Close() + _ = service.Close() + os.Exit(code) } -func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { +func setupEnv(t *testing.T) *ingestEnv { t.Helper() - res, err := s.Query(ctx, q, vars) + if sharedService == nil { + t.Skipf("%s not set — run integration-test/ingest/run.sh to spin up a postgres container", envPostgresDSN) + } + + // Truncate before each test to guarantee determinism (single shared table). + _, err := sharedPgConn.ExecContext(context.Background(), "TRUNCATE TABLE events RESTART IDENTITY") require.NoError(t, err) - if res.Err() != nil { - require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + + return &ingestEnv{ + service: sharedService, + server: sharedServer, + pgConn: sharedPgConn, + client: sharedClient, + dsName: "pg_ingest", } - res.Close() } // makeEventsRecord builds a single Arrow RecordBatch with the columns of the From 3a63db2e8b295cfe1854c3ab47aca2a2e53274b8 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 31 May 2026 22:39:36 +0400 Subject: [PATCH 08/36] ipc ingest --- .../{ingest => ingest-postgres}/docker-compose.yml | 0 .../ingest_postgres_test.go} | 4 ++-- integration-test/{ingest => ingest-postgres}/run.sh | 2 +- .../{ingest => ingest-postgres}/testdata/init.sql | 0 .../testdata/schemas/pg_ingest/schema.graphql | 0 5 files changed, 3 insertions(+), 3 deletions(-) rename integration-test/{ingest => ingest-postgres}/docker-compose.yml (100%) rename integration-test/{ingest/ingest_test.go => ingest-postgres/ingest_postgres_test.go} (99%) rename integration-test/{ingest => ingest-postgres}/run.sh (98%) rename integration-test/{ingest => ingest-postgres}/testdata/init.sql (100%) rename integration-test/{ingest => ingest-postgres}/testdata/schemas/pg_ingest/schema.graphql (100%) diff --git a/integration-test/ingest/docker-compose.yml b/integration-test/ingest-postgres/docker-compose.yml similarity index 100% rename from integration-test/ingest/docker-compose.yml rename to integration-test/ingest-postgres/docker-compose.yml diff --git a/integration-test/ingest/ingest_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go similarity index 99% rename from integration-test/ingest/ingest_test.go rename to integration-test/ingest-postgres/ingest_postgres_test.go index 56c1a964..8be37754 100644 --- a/integration-test/ingest/ingest_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -1,6 +1,6 @@ //go:build duckdb_arrow -package ingest_test +package ingest_postgres_test import ( "bytes" @@ -161,7 +161,7 @@ func TestMain(m *testing.M) { func setupEnv(t *testing.T) *ingestEnv { t.Helper() if sharedService == nil { - t.Skipf("%s not set — run integration-test/ingest/run.sh to spin up a postgres container", envPostgresDSN) + t.Skipf("%s not set — run integration-test/ingest-postgres/run.sh to spin up a postgres container", envPostgresDSN) } // Truncate before each test to guarantee determinism (single shared table). diff --git a/integration-test/ingest/run.sh b/integration-test/ingest-postgres/run.sh similarity index 98% rename from integration-test/ingest/run.sh rename to integration-test/ingest-postgres/run.sh index 61d0a6d9..a3c4a241 100755 --- a/integration-test/ingest/run.sh +++ b/integration-test/ingest-postgres/run.sh @@ -26,4 +26,4 @@ export INGEST_POSTGRES_DSN="postgres://test:test@127.0.0.1:5437/ingestdb" export HUGR_INGEST_SCHEMAS_PATH="$HERE/testdata/schemas" cd "$HERE/../.." -go test -tags=duckdb_arrow -count=1 -v ./integration-test/ingest/... +go test -tags=duckdb_arrow -count=1 -v ./integration-test/ingest-postgres/... diff --git a/integration-test/ingest/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql similarity index 100% rename from integration-test/ingest/testdata/init.sql rename to integration-test/ingest-postgres/testdata/init.sql diff --git a/integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql similarity index 100% rename from integration-test/ingest/testdata/schemas/pg_ingest/schema.graphql rename to integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql From db98e94b6a8d08a3bd19a72ccc83855ff6413b63 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 6 Jun 2026 20:36:13 +0400 Subject: [PATCH 09/36] ipc ingest --- ipc-ingest.go | 271 +++----------------------- pkg/engines/duckdb.go | 60 ++++++ pkg/engines/engines.go | 2 + pkg/engines/postgres.go | 5 + pkg/planner/node_ingest.go | 379 +++++++++++++++++++++++++++++++++++++ pkg/planner/plan.go | 53 +++++- pkg/planner/planer.go | 18 +- 7 files changed, 537 insertions(+), 251 deletions(-) create mode 100644 pkg/planner/node_ingest.go diff --git a/ipc-ingest.go b/ipc-ingest.go index 68eb9c27..983fa69e 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -1,7 +1,6 @@ package hugr import ( - "context" "encoding/json" "errors" "fmt" @@ -11,19 +10,13 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" - "github.com/google/uuid" "github.com/hugr-lab/query-engine/pkg/auth" - "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" - "github.com/hugr-lab/query-engine/pkg/catalog/sdl" - "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/pkg/perm" - "github.com/vektah/gqlparser/v2/ast" ) const ( - ingestContentType = "application/vnd.apache.arrow.stream" - ingestDataObjectArg = "data_object" - ingestViewNamePrefix = "_hugr_ingest_" + ingestContentType = "application/vnd.apache.arrow.stream" + ingestDataObjectArg = "data_object" ) // IngestResponse is the success payload returned by /ipc/ingest. @@ -38,21 +31,9 @@ type ingestErrorBody struct { } // ipcIngestHandler accepts an Apache Arrow IPC stream in the request body and -// inserts the records into the target data object referenced by the -// `data_object` query parameter. -// -// Wire protocol (first iteration): -// - Method: POST -// - URL: /ipc/ingest?data_object= -// - Headers: Content-Type: application/vnd.apache.arrow.stream -// - Body: Arrow IPC stream (schema + record batches) -// - Response: 200 OK, JSON {"data_object": ..., "inserted": N, "columns": [...]} -// -// Restrictions intentionally enforced on this iteration (per design): -// - INSERT only (no on-conflict / merge / upsert / returning) -// - target must be a table data object (views are rejected) -// - reference fields are skipped (not insertable through this path) -// - permissions are checked against the synthetic insert mutation input +// inserts it into a table data object. The planner resolves the target schema, +// validates insert inputs/permissions, casts Arrow values, and builds the +// INSERT FROM SELECT statement over a temporary Arrow view. func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") @@ -74,8 +55,6 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { } ctx := r.Context() - // Auth middleware already populated permissions; make sure we have them - // (handles direct-handler callers that bypass the middleware in tests). if perm.PermissionsFromCtx(ctx) == nil { newCtx, err := s.perm.ContextWithPermissions(ctx) if err != nil { @@ -89,19 +68,6 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { ctx = newCtx } - info, mutationField, err := s.resolveIngestTarget(ctx, dataObject) - if err != nil { - writeIngestError(w, http.StatusBadRequest, err.Error()) - return - } - - eng, err := s.ds.Engine(info.Catalog) - if err != nil { - writeIngestError(w, http.StatusBadRequest, - fmt.Sprintf("engine for catalog %q not available: %v", info.Catalog, err)) - return - } - reader, err := ipc.NewReader(r.Body, ipc.WithAllocator(memory.NewGoAllocator())) if err != nil { writeIngestError(w, http.StatusBadRequest, "invalid arrow stream: "+err.Error()) @@ -109,240 +75,47 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { } defer reader.Release() - columns, err := resolveIngestColumns(reader.Schema(), info) - if err != nil { - writeIngestError(w, http.StatusBadRequest, err.Error()) - return - } - if len(columns) == 0 { - writeIngestError(w, http.StatusBadRequest, - "no insertable columns matched between arrow stream and data object") - return + plan, cancel, err := s.planner.PlanIngest(ctx, s.schema.Provider(), dataObject, reader) + if cancel != nil { + defer func() { _ = cancel() }() } - - if err := checkIngestPermission(ctx, info, mutationField, columns); err != nil { + if err != nil { if errors.Is(err, auth.ErrForbidden) { writeIngestError(w, http.StatusForbidden, err.Error()) return } - writeIngestError(w, http.StatusInternalServerError, err.Error()) + writeIngestError(w, http.StatusBadRequest, err.Error()) + return + } + if err := plan.Compile(); err != nil { + writeIngestError(w, http.StatusBadRequest, err.Error()) return } - inserted, err := s.executeIngest(ctx, info, eng, reader, columns) + res, err := plan.ExecuteExec(ctx, s.db) if err != nil { writeIngestError(w, http.StatusInternalServerError, err.Error()) return } + inserted, _ := res.RowsAffected() out := IngestResponse{ DataObject: dataObject, Inserted: inserted, - Columns: columnNames(columns), + Columns: ingestSchemaColumnNames(reader.Schema()), } _ = json.NewEncoder(w).Encode(out) } -// ingestColumn binds an Arrow input column to an SDL field of the target table. -type ingestColumn struct { - ArrowName string // column name as it appears in the incoming Arrow schema - Field *sdl.Field // resolved SDL field of the target data object -} - -func columnNames(cs []ingestColumn) []string { - out := make([]string, len(cs)) - for i, c := range cs { - out[i] = c.ArrowName - } - return out -} - -// resolveIngestTarget walks the GraphQL schema to find the target data object -// and the corresponding insert mutation field. The dataObject argument can be -// either a dotted Query path (e.g. "pg_store.public.events") or a bare GraphQL -// type name (e.g. "pg_store_public_events"). -func (s *Service) resolveIngestTarget(ctx context.Context, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { - provider := s.schema.Provider() - - var def *ast.Definition - if strings.Contains(dataObject, ".") { - queryDef := provider.ForName(ctx, base.QueryBaseName) - if queryDef == nil { - return nil, nil, fmt.Errorf("query base type not found in schema") - } - cur := queryDef - for _, part := range strings.Split(dataObject, ".") { - f := cur.Fields.ForName(part) - if f == nil { - return nil, nil, fmt.Errorf("data object %q: segment %q not found", dataObject, part) - } - cur = provider.ForName(ctx, f.Type.Name()) - if cur == nil { - return nil, nil, fmt.Errorf("data object %q: type %q not found", dataObject, f.Type.Name()) - } - } - def = cur - } else { - def = provider.ForName(ctx, dataObject) - } - if def == nil { - return nil, nil, fmt.Errorf("data object %q not found in schema", dataObject) - } - if !sdl.IsDataObject(def) { - return nil, nil, fmt.Errorf("%q is not a data object", dataObject) - } - info := sdl.DataObjectInfo(def) - if info == nil { - return nil, nil, fmt.Errorf("data object %q: no info", dataObject) - } - if info.Type != sdl.TableDataObject { - return nil, nil, fmt.Errorf("data object %q is not a table (got %q) — only tables are ingestable", dataObject, info.Type) - } - if info.Catalog == "" { - return nil, nil, fmt.Errorf("data object %q has no catalog", dataObject) - } - - // Find the insert mutation field; we need it for permission checks. - _, mutationField := sdl.ObjectMutationDefinition(ctx, provider, def, sdl.MutationTypeInsert) - if mutationField == nil { - return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) - } - return info, mutationField, nil -} - -// resolveIngestColumns maps Arrow schema fields onto SDL fields of the table. -// Reference / virtual / computed fields are intentionally rejected because -// they are not directly insertable. -func resolveIngestColumns(schema *arrow.Schema, info *sdl.Object) ([]ingestColumn, error) { +func ingestSchemaColumnNames(schema *arrow.Schema) []string { if schema == nil { - return nil, fmt.Errorf("arrow stream has no schema") - } - cols := make([]ingestColumn, 0, schema.NumFields()) - seen := map[string]struct{}{} - for _, f := range schema.Fields() { - if _, dup := seen[f.Name]; dup { - return nil, fmt.Errorf("duplicate arrow column %q", f.Name) - } - seen[f.Name] = struct{}{} - - fi := info.FieldForName(f.Name) - if fi == nil { - return nil, fmt.Errorf("column %q is not defined in data object %q", - f.Name, info.Definition().Name) - } - if fi.IsReferencesSubquery() { - return nil, fmt.Errorf("column %q is a reference and cannot be ingested directly", - f.Name) - } - if fi.IsNotDBField() { - return nil, fmt.Errorf("column %q is a computed/virtual field and cannot be ingested", - f.Name) - } - if fi.FieldSourceName("", false) == "-" { - return nil, fmt.Errorf("column %q has no database mapping", f.Name) - } - cols = append(cols, ingestColumn{ArrowName: f.Name, Field: fi}) - } - return cols, nil -} - -// checkIngestPermission verifies that the caller may invoke the insert mutation -// and write each of the supplied columns. It mirrors RolePermissions.CheckQuery -// + CheckMutationInput but operates on the synthetic per-column payload that -// an Arrow batch represents. -func checkIngestPermission(ctx context.Context, info *sdl.Object, mutationField *ast.FieldDefinition, cols []ingestColumn) error { - if auth.IsFullAccess(ctx) { - return nil - } - rp := perm.PermissionsFromCtx(ctx) - if rp == nil { - // No permissions configured = allow (matches behaviour of - // CheckQuery callers in the rest of the engine). - return nil - } - if rp.Disabled { - return auth.ErrForbidden - } - - // 1) mutation field itself must be enabled on the parent module type. - parent := "" - if mutationField != nil { - if pd, ok := mutationFieldParent(info); ok { - parent = pd - } - if _, ok := rp.Enabled(parent, mutationField.Name); !ok { - return auth.ErrForbidden - } - } - - // 2) each ingested column must be enabled on the insert input type. - inputName := info.InputInsertDataName() - if inputName == "" { return nil } - for _, c := range cols { - if _, ok := rp.Enabled(inputName, c.ArrowName); !ok { - return auth.ErrForbidden - } - } - return nil -} - -// mutationFieldParent returns the name of the GraphQL type that owns the -// insert mutation field for this data object. That type is the field-level -// permission scope (Permission.Object) used by RolePermissions.Enabled. -func mutationFieldParent(info *sdl.Object) (string, bool) { - mod := sdl.ObjectModule(info.Definition()) - return sdl.ModuleTypeName(mod, sdl.ModuleMutation), true -} - -// executeIngest registers the Arrow record reader as a DuckDB view and runs -// `INSERT INTO () SELECT FROM `. The view is bound -// to the underlying DuckDB driver connection and released after the INSERT -// completes (success or failure). -func (s *Service) executeIngest(ctx context.Context, info *sdl.Object, eng engines.Engine, reader *ipc.Reader, cols []ingestColumn) (int64, error) { - ar, err := s.db.Arrow(ctx) - if err != nil { - return 0, fmt.Errorf("acquire duckdb arrow conn: %w", err) - } - defer ar.Close() - - viewName := ingestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", "") - release, err := ar.RegisterView(reader, viewName) - if err != nil { - return 0, fmt.Errorf("register arrow view: %w", err) - } - defer release() - - sqlStr := buildIngestSQL(ctx, info, eng, cols, viewName) - res, err := ar.Exec(ctx, sqlStr) - if err != nil { - return 0, fmt.Errorf("ingest insert failed: %w", err) - } - n, _ := res.RowsAffected() - return n, nil -} - -// buildIngestSQL constructs the INSERT ... SELECT statement that drains the -// registered Arrow view into the target table. The target is fully qualified -// with the catalog (data-source) identifier so that DuckDB's postgres -// extension can route the INSERT through the attached database. -func buildIngestSQL(ctx context.Context, info *sdl.Object, eng engines.Engine, cols []ingestColumn, viewName string) string { - target := info.SQL(ctx, engines.Ident(info.Catalog)) - - colNames := make([]string, len(cols)) - selectExprs := make([]string, len(cols)) - for i, c := range cols { - colNames[i] = c.Field.FieldSourceName("", true) - selectExprs[i] = engines.Ident(c.ArrowName) + out := make([]string, 0, schema.NumFields()) + for _, f := range schema.Fields() { + out = append(out, f.Name) } - - return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s", - target, - strings.Join(colNames, ", "), - strings.Join(selectExprs, ", "), - engines.Ident(viewName), - ) + return out } func writeIngestError(w http.ResponseWriter, status int, msg string) { diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 4b1ef5a4..bc516925 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -8,6 +8,7 @@ import ( "strings" "time" + "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -83,6 +84,65 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { } } +func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { + if field == nil || field.Definition == nil { + return sql, nil + } + switch field.Definition.Type.Name() { + case base.JSONTypeName: + switch arrowField.Type.ID() { + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW, + arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + return "try_cast(" + sql + " AS JSON)", nil + case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, + arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: + return "to_json(" + sql + ")", nil + default: + return sql, nil + } + case base.GeometryTypeName: + return castArrowGeometryToDuckDB(arrowField, sql) + default: + return sql, nil + } +} + +func arrowExtensionName(field arrow.Field) string { + if ext, ok := field.Metadata.GetValue("ARROW:extension:name"); ok { + return strings.ToLower(ext) + } + if ext, ok := field.Metadata.GetValue("extension:name"); ok { + return strings.ToLower(ext) + } + return "" +} + +func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { + switch arrowExtensionName(field) { + case "geoarrow.wkb": + return "ST_GeomFromWKB(" + sql + ")", nil + case "geoarrow.wkt": + return "ST_GeomFromText(" + sql + ", true)", nil + case "hugr.geojson", "geoarrow.geojson", "geojson": + return "ST_GeomFromGeoJSON(" + sql + ")", nil + case "geoarrow.point", "geoarrow.linestring", "geoarrow.polygon", + "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", + "geoarrow.geometry", "geoarrow.geometrycollection": + return sql, nil + } + + switch field.Type.ID() { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: + return "ST_GeomFromWKB(" + sql + ")", nil + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN ST_GeomFromGeoJSON(" + sql + ") ELSE ST_GeomFromText(" + sql + ", true) END", nil + case arrow.STRUCT, arrow.MAP: + return "ST_GeomFromGeoJSON(to_json(" + sql + ")::VARCHAR)", nil + default: + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", field.Name, field.Type) + } +} + func (e *DuckDB) FieldValueByPath(sqlName, path string) string { if path == "" { return sqlName diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 25f91f8e..8041b8dd 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -5,6 +5,7 @@ import ( "fmt" "strings" + "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -70,6 +71,7 @@ type EngineTypeCaster interface { Engine ToIntermediateType(*ast.Field) (string, error) CastFromIntermediateType(field *ast.Field, toJSON bool) (string, error) + CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) } type EngineVectorDistanceCalculator interface { diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 970d1bb6..55f7e5e2 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -592,6 +593,10 @@ func (e *Postgres) CastFromIntermediateType(f *ast.Field, toJSON bool) (string, return Ident(f.Alias), nil } +func (e *Postgres) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { + return CastArrowIngestValueToDuckDB(field, arrowField, sql) +} + func pgRangeValueToSQLValue(v any) (string, error) { if v == nil { return "NULL", nil diff --git a/pkg/planner/node_ingest.go b/pkg/planner/node_ingest.go new file mode 100644 index 00000000..f592433d --- /dev/null +++ b/pkg/planner/node_ingest.go @@ -0,0 +1,379 @@ +package planner + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strings" + "sync" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/google/uuid" + "github.com/hugr-lab/query-engine/pkg/auth" + "github.com/hugr-lab/query-engine/pkg/catalog" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/db" + "github.com/hugr-lab/query-engine/pkg/engines" + "github.com/hugr-lab/query-engine/pkg/perm" + "github.com/vektah/gqlparser/v2/ast" +) + +const ingestViewNamePrefix = "_hugr_ingest_" + +type ingestColumn struct { + ArrowField arrow.Field + Field *sdl.Field + FieldDef *ast.FieldDefinition + InputDef *ast.FieldDefinition +} + +type ingestExecState struct { + reader array.RecordReader + view string + arrow *db.Arrow + release func() + once sync.Once + err error +} + +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, func() error, error) { + if dataObject == "" { + return nil, nil, fmt.Errorf("missing data object") + } + if reader == nil { + return nil, nil, fmt.Errorf("missing arrow reader") + } + + info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) + if err != nil { + return nil, nil, err + } + engine, err := planner.Engine(info.Catalog) + if err != nil { + return nil, nil, err + } + if caps := engine.Capabilities(); caps == nil || !caps.Insert.Insert { + return nil, nil, fmt.Errorf("engine %q does not support insert", engine.Type()) + } + mutation := sdl.MutationInfo(ctx, provider, mutationField) + if mutation == nil || mutation.Type != sdl.MutationTypeInsert { + return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + } + + columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema()) + if err != nil { + return nil, nil, err + } + if len(columns) == 0 { + return nil, nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") + } + permissionData, err := checkIngestPermissions(ctx, provider, info, mutationField, columns) + if err != nil { + return nil, nil, err + } + + state := &ingestExecState{ + reader: reader, + view: ingestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""), + } + node := ingestNode(ctx, info, mutation, engine, columns, permissionData, state) + return node, state.cancel, nil +} + +func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { + var def *ast.Definition + if strings.Contains(dataObject, ".") { + queryDef := provider.ForName(ctx, base.QueryBaseName) + if queryDef == nil { + return nil, nil, fmt.Errorf("query base type not found in schema") + } + cur := queryDef + for _, part := range strings.Split(dataObject, ".") { + f := cur.Fields.ForName(part) + if f == nil { + return nil, nil, fmt.Errorf("data object %q: segment %q not found", dataObject, part) + } + cur = provider.ForName(ctx, f.Type.Name()) + if cur == nil { + return nil, nil, fmt.Errorf("data object %q: type %q not found", dataObject, f.Type.Name()) + } + } + def = cur + } else { + def = provider.ForName(ctx, dataObject) + } + if def == nil { + return nil, nil, fmt.Errorf("data object %q not found in schema", dataObject) + } + if !sdl.IsDataObject(def) { + return nil, nil, fmt.Errorf("%q is not a data object", dataObject) + } + info := sdl.DataObjectInfo(def) + if info == nil { + return nil, nil, fmt.Errorf("data object %q: no info", dataObject) + } + if info.Type != sdl.TableDataObject { + return nil, nil, fmt.Errorf("data object %q is not a table (got %q): only tables are ingestable", dataObject, info.Type) + } + if info.Catalog == "" { + return nil, nil, fmt.Errorf("data object %q has no catalog", dataObject) + } + _, mutationField := sdl.ObjectMutationDefinition(ctx, provider, def, sdl.MutationTypeInsert) + if mutationField == nil { + return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + } + return info, mutationField, nil +} + +func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutation *sdl.Mutation, schema *arrow.Schema) ([]ingestColumn, error) { + if schema == nil { + return nil, fmt.Errorf("arrow stream has no schema") + } + inputName := info.InputInsertDataName() + if inputName == "" { + return nil, fmt.Errorf("data object %q has no insert input type", info.Name) + } + input := provider.ForName(ctx, inputName) + if input == nil { + return nil, fmt.Errorf("insert input type %q not found", inputName) + } + + seen := map[string]struct{}{} + byName := make(map[string]ingestColumn, schema.NumFields()) + for _, af := range schema.Fields() { + if _, dup := seen[af.Name]; dup { + return nil, fmt.Errorf("duplicate arrow column %q", af.Name) + } + seen[af.Name] = struct{}{} + + inputField := input.Fields.ForName(af.Name) + if inputField == nil { + return nil, fmt.Errorf("column %q is not defined in insert input %q", af.Name, inputName) + } + objectField := info.Definition().Fields.ForName(af.Name) + if objectField == nil { + return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) + } + fieldInfo := info.FieldForName(af.Name) + if fieldInfo == nil { + return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) + } + if fieldInfo.IsReferencesSubquery() { + return nil, fmt.Errorf("column %q is a reference and cannot be ingested directly", af.Name) + } + if fieldInfo.IsNotDBField() { + return nil, fmt.Errorf("column %q is a computed/virtual field and cannot be ingested", af.Name) + } + if fieldInfo.FieldSourceName("", false) == "-" { + return nil, fmt.Errorf("column %q has no database mapping", af.Name) + } + byName[af.Name] = ingestColumn{ + ArrowField: af, + Field: fieldInfo, + FieldDef: objectField, + InputDef: inputField, + } + } + + for _, fieldInfo := range mutation.Fields() { + if _, ok := byName[fieldInfo.Name]; ok { + continue + } + if !fieldInfo.IsRequired() { + continue + } + if fieldInfo.SequenceName() != "" || mutation.FieldHasDefaultInsertExpr(fieldInfo.Name) { + continue + } + if fd := info.Definition().Fields.ForName(fieldInfo.Name); fd != nil && + fd.Directives.ForName(base.FieldDefaultDirectiveName) != nil { + continue + } + return nil, fmt.Errorf("field %q is required for ingest into %q", fieldInfo.Name, info.Name) + } + + columns := make([]ingestColumn, 0, len(byName)) + for _, af := range schema.Fields() { + columns = append(columns, byName[af.Name]) + } + return columns, nil +} + +func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutationField *ast.FieldDefinition, columns []ingestColumn) (map[string]any, error) { + if auth.IsFullAccess(ctx) { + return nil, nil + } + rp := perm.PermissionsFromCtx(ctx) + if rp == nil { + return nil, nil + } + if rp.Disabled { + return nil, auth.ErrForbidden + } + + parent := sdl.ModuleTypeName(sdl.ObjectModule(info.Definition()), sdl.ModuleMutation) + if _, ok := rp.Enabled(parent, mutationField.Name); !ok { + return nil, auth.ErrForbidden + } + + data := make(map[string]any, len(columns)) + for _, c := range columns { + data[c.InputDef.Name] = nil + } + var permissionData map[string]any + if arg := rp.DataArgument(ctx, parent, mutationField.Name); arg != nil { + values, err := sdl.ParseDataAsInputObject(ctx, provider, &ast.Type{ + NamedType: info.InputInsertDataName(), + Position: base.CompiledPos("ingest permission data"), + }, arg, false) + if err != nil { + return nil, err + } + if values != nil { + permissionData = values.(map[string]any) + for k, v := range permissionData { + data[k] = v + } + } + } + if err := rp.CheckMutationInput(ctx, provider, info.InputInsertDataName(), data); err != nil { + return nil, err + } + return permissionData, nil +} + +func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any, state *ingestExecState) *QueryPlanNode { + needsSpatial := ingestNeedsSpatial(columns) + return &QueryPlanNode{ + Name: "ingest_" + info.Name, + Before: func(ctx context.Context, pool *db.Pool, node *QueryPlanNode) error { + ar, err := pool.Arrow(ctx) + if err != nil { + return fmt.Errorf("acquire duckdb arrow conn: %w", err) + } + if needsSpatial { + if _, err := ar.Exec(ctx, "LOAD spatial; CALL register_geoarrow_extensions()"); err != nil { + _ = ar.Close() + return fmt.Errorf("prepare spatial arrow ingest: %w", err) + } + } + release, err := ar.RegisterView(state.reader, state.view) + if err != nil { + _ = ar.Close() + return fmt.Errorf("register arrow view: %w", err) + } + state.arrow = ar + state.release = release + node.plan.exec = func(ctx context.Context, query string, args ...any) (sql.Result, error) { + if len(args) != 0 { + return nil, fmt.Errorf("ingest execution does not support SQL parameters") + } + res, err := ar.Exec(ctx, query) + if err != nil { + return nil, err + } + return res, nil + } + return nil + }, + After: func(ctx context.Context, pool *db.Pool, node *QueryPlanNode) error { + return state.cancel() + }, + CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { + fieldValues := make(map[string]string, len(columns)) + for _, c := range columns { + value := engines.Ident(c.ArrowField.Name) + field := &ast.Field{ + Name: c.Field.Name, + Alias: c.Field.Name, + Definition: c.FieldDef, + ObjectDefinition: info.Definition(), + } + var err error + if caster, ok := engine.(engines.EngineTypeCaster); ok { + value, err = caster.CastArrowIngestValue(field, c.ArrowField, value) + } else { + value, err = engines.CastArrowIngestValueToDuckDB(field, c.ArrowField, value) + } + if err != nil { + return "", nil, err + } + fieldValues[c.Field.Name] = value + } + for name, value := range permissionData { + fieldInfo := info.FieldForName(name) + if fieldInfo == nil { + return "", nil, fmt.Errorf("permission data field %q is not defined in data object %q", name, info.Name) + } + if fieldInfo.IsReferencesSubquery() || fieldInfo.IsNotDBField() { + return "", nil, fmt.Errorf("permission data field %q cannot be ingested directly", name) + } + sqlValue, err := engine.SQLValue(value) + if err != nil { + return "", nil, err + } + fieldValues[name] = sqlValue + } + if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engine); err != nil { + return "", nil, err + } + + var targetFields, selectExprs []string + for _, c := range columns { + targetFields = append(targetFields, c.Field.FieldSourceName("", true)) + selectExprs = append(selectExprs, fieldValues[c.Field.Name]) + delete(fieldValues, c.Field.Name) + } + for _, fieldInfo := range mutation.Fields() { + expr, ok := fieldValues[fieldInfo.Name] + if !ok { + continue + } + if fieldInfo.FieldSourceName("", false) == "-" { + continue + } + targetFields = append(targetFields, fieldInfo.FieldSourceName("", true)) + selectExprs = append(selectExprs, expr) + delete(fieldValues, fieldInfo.Name) + } + if len(targetFields) == 0 { + return "", nil, fmt.Errorf("no values provided for ingest") + } + + target := info.SQL(ctx, engines.Ident(info.Catalog)) + return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s", + target, + strings.Join(targetFields, ", "), + strings.Join(selectExprs, ", "), + engines.Ident(state.view), + ), params, nil + }, + } +} + +func ingestNeedsSpatial(columns []ingestColumn) bool { + for _, c := range columns { + if c.FieldDef != nil && c.FieldDef.Type.Name() == base.GeometryTypeName { + return true + } + } + return false +} + +func (s *ingestExecState) cancel() error { + s.once.Do(func() { + if s.release != nil { + s.release() + } + if s.arrow != nil { + s.err = s.arrow.Close() + } + }) + if errors.Is(s.err, sql.ErrConnDone) { + return nil + } + return s.err +} diff --git a/pkg/planner/plan.go b/pkg/planner/plan.go index d04c0979..fc5c182c 100644 --- a/pkg/planner/plan.go +++ b/pkg/planner/plan.go @@ -2,13 +2,14 @@ package planner import ( "context" + "database/sql" "errors" + "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/engines" - "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/types" "github.com/vektah/gqlparser/v2/ast" ) @@ -19,6 +20,8 @@ type QueryPlan struct { CompiledQuery string Params []any + + exec QueryPlanExecFunc } func (p *QueryPlan) Compile() error { @@ -42,6 +45,14 @@ func (p *QueryPlan) Execute(ctx context.Context, db *db.Pool) (data interface{}, return nil, err } } + if p.RootNode.After != nil { + defer func() { + afterErr := p.RootNode.After(ctx, db, p.RootNode) + if err == nil { + err = afterErr + } + }() + } switch { case sdl.IsScalarType(p.Query.Definition.Type.Name()) && @@ -82,6 +93,9 @@ func (p *QueryPlan) ExecuteStream(ctx context.Context, db *db.Pool) (types.Arrow tbl, done, err := db.QueryTableStream(ctx, p.CompiledQuery, p.Params...) if err != nil { + if p.RootNode.After != nil { + _ = p.RootNode.After(ctx, db, p.RootNode) + } return nil, nil, err } if tbl != nil { @@ -89,9 +103,43 @@ func (p *QueryPlan) ExecuteStream(ctx context.Context, db *db.Pool) (types.Arrow tbl.SetGeometryInfo(gi) } } + if p.RootNode.After != nil { + originalDone := done + done = func() { + if originalDone != nil { + originalDone() + } + _ = p.RootNode.After(ctx, db, p.RootNode) + } + } return tbl, done, nil } +func (p *QueryPlan) ExecuteExec(ctx context.Context, db *db.Pool) (res sql.Result, err error) { + if p.CompiledQuery == "" { + return nil, errors.New("no compiled query") + } + if p.RootNode.Before != nil { + err = p.RootNode.Before(ctx, db, p.RootNode) + if err != nil { + return nil, err + } + } + if p.RootNode.After != nil { + defer func() { + afterErr := p.RootNode.After(ctx, db, p.RootNode) + if err == nil { + err = afterErr + } + }() + } + exec := p.exec + if exec == nil { + exec = db.Exec + } + return exec(ctx, p.CompiledQuery, p.Params...) +} + func (p *QueryPlan) Log() string { return p.CompiledQuery } @@ -109,6 +157,7 @@ type QueryPlanNode struct { Parent *QueryPlanNode Before NodeBeforeExecFunc + After NodeAfterExecFunc provider catalog.Provider engines Catalog @@ -118,6 +167,8 @@ type QueryPlanNode struct { } type NodeBeforeExecFunc func(ctx context.Context, db *db.Pool, node *QueryPlanNode) error +type NodeAfterExecFunc func(ctx context.Context, db *db.Pool, node *QueryPlanNode) error +type QueryPlanExecFunc func(ctx context.Context, query string, args ...any) (sql.Result, error) type QueryPlanNodes []*QueryPlanNode diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index 806d8f33..61fac33d 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -4,9 +4,10 @@ import ( "context" "errors" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" "github.com/hugr-lab/query-engine/pkg/engines" - "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/types" "github.com/vektah/gqlparser/v2/ast" ) @@ -65,3 +66,18 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as return &QueryPlan{Query: query, RootNode: node}, nil } + +func (s *Service) PlanIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, func() error, error) { + node, cancel, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) + if err != nil { + if cancel != nil { + _ = cancel() + } + return nil, nil, err + } + node.provider = provider + node.engines = s.engines + node.querier = s.querier + + return &QueryPlan{RootNode: node}, cancel, nil +} From 215f2b0ac61118f0430e6b8c98e1ebfd5f3418c3 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 6 Jun 2026 20:41:27 +0400 Subject: [PATCH 10/36] ipc ingest --- pkg/catalog/compiler/base/options.go | 1 + pkg/engines/duckdb.go | 1 + pkg/engines/iceberg.go | 1 + pkg/engines/postgres.go | 1 + pkg/planner/node_ingest.go | 4 ++-- 5 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/catalog/compiler/base/options.go b/pkg/catalog/compiler/base/options.go index 8f007056..b0598537 100644 --- a/pkg/catalog/compiler/base/options.go +++ b/pkg/catalog/compiler/base/options.go @@ -91,6 +91,7 @@ type EngineCapabilities struct { type EngineInsertCapabilities struct { Insert bool + Ingest bool Returning bool InsertReferences bool } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index bc516925..3f5340d6 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -69,6 +69,7 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { }, Insert: compiler.EngineInsertCapabilities{ Insert: true, + Ingest: true, Returning: true, InsertReferences: true, }, diff --git a/pkg/engines/iceberg.go b/pkg/engines/iceberg.go index 6cfe49cc..da67b53b 100644 --- a/pkg/engines/iceberg.go +++ b/pkg/engines/iceberg.go @@ -33,6 +33,7 @@ func (e *Iceberg) Capabilities() *compiler.EngineCapabilities { caps.General.SupportTimeTravel = true // DuckDB Iceberg extension doesn't support targeted inserts (INSERT INTO tbl(col1,col2) VALUES ...) caps.Insert.Insert = false + caps.Insert.Ingest = false caps.Insert.Returning = false caps.Insert.InsertReferences = false return &caps diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 55f7e5e2..5a06e63e 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -45,6 +45,7 @@ func (e *Postgres) Capabilities() *compiler.EngineCapabilities { }, Insert: compiler.EngineInsertCapabilities{ Insert: true, + Ingest: true, Returning: true, InsertReferences: true, }, diff --git a/pkg/planner/node_ingest.go b/pkg/planner/node_ingest.go index f592433d..ff4a78e1 100644 --- a/pkg/planner/node_ingest.go +++ b/pkg/planner/node_ingest.go @@ -55,8 +55,8 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err != nil { return nil, nil, err } - if caps := engine.Capabilities(); caps == nil || !caps.Insert.Insert { - return nil, nil, fmt.Errorf("engine %q does not support insert", engine.Type()) + if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { + return nil, nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { From c70c82068fb9c3840b2bee0cf0f7fda72f029348 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 7 Jun 2026 12:21:55 +0400 Subject: [PATCH 11/36] ipc ingest --- ipc-ingest.go | 2 +- pkg/planner/planer.go | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ipc-ingest.go b/ipc-ingest.go index 983fa69e..a5db356d 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -75,7 +75,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { } defer reader.Release() - plan, cancel, err := s.planner.PlanIngest(ctx, s.schema.Provider(), dataObject, reader) + plan, cancel, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, reader) if cancel != nil { defer func() { _ = cancel() }() } diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index 61fac33d..804ceb58 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -67,7 +67,10 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as return &QueryPlan{Query: query, RootNode: node}, nil } -func (s *Service) PlanIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, func() error, error) { +// PlanArrowIngest builds an INSERT-from-Arrow-view plan for the target data object. +// The Arrow reader is part of this planning API because its schema drives column +// resolution and ingest casting, while execution registers it as a temporary view. +func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, func() error, error) { node, cancel, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) if err != nil { if cancel != nil { From 6a03e9e6f0fd918a3d27b20183b27f33cbe6acaf Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 7 Jun 2026 12:30:33 +0400 Subject: [PATCH 12/36] ipc ingest --- pkg/planner/{node_ingest.go => node_arrow_ingest.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pkg/planner/{node_ingest.go => node_arrow_ingest.go} (100%) diff --git a/pkg/planner/node_ingest.go b/pkg/planner/node_arrow_ingest.go similarity index 100% rename from pkg/planner/node_ingest.go rename to pkg/planner/node_arrow_ingest.go From 45cad1558edc585fa0233c2b88b73938cb2a6af9 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 7 Jun 2026 13:30:54 +0400 Subject: [PATCH 13/36] ipc ingest --- ipc-ingest.go | 11 ++-- pkg/db/pool.go | 68 ++++++++++++++++---- pkg/planner/node_arrow_ingest.go | 105 ++++--------------------------- pkg/planner/plan.go | 52 --------------- pkg/planner/planer.go | 11 ++-- 5 files changed, 79 insertions(+), 168 deletions(-) diff --git a/ipc-ingest.go b/ipc-ingest.go index a5db356d..7432d44b 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -75,10 +75,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { } defer reader.Release() - plan, cancel, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, reader) - if cancel != nil { - defer func() { _ = cancel() }() - } + plan, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, reader) if err != nil { if errors.Is(err, auth.ErrForbidden) { writeIngestError(w, http.StatusForbidden, err.Error()) @@ -92,7 +89,11 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { return } - res, err := plan.ExecuteExec(ctx, s.db) + if len(plan.Params) != 0 { + writeIngestError(w, http.StatusInternalServerError, "arrow ingest plan produced SQL parameters") + return + } + res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery) if err != nil { writeIngestError(w, http.StatusInternalServerError, err.Error()) return diff --git a/pkg/db/pool.go b/pkg/db/pool.go index a0d88df0..1ae7376d 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -8,9 +8,15 @@ import ( "strings" "sync" + "github.com/apache/arrow-go/v18/arrow/array" "github.com/duckdb/duckdb-go/v2" ) +// TempArrowViewName is the fixed per-connection view name used by +// ExecWithArrowView. DuckDB views registered from Arrow readers are scoped to +// the driver connection, so a stable name is safe across concurrent requests. +const TempArrowViewName = "_hugr_arrow_view" + type Config struct { Path string `json:"path"` MaxOpenConns int `json:"max_open_conns"` @@ -221,6 +227,56 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { }, nil } +// ExecWithArrowView registers reader as TempArrowViewName and executes query on +// the same DuckDB driver connection, where the temporary Arrow view is visible. +func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string) (sql.Result, error) { + if reader == nil { + return nil, fmt.Errorf("missing arrow reader") + } + ar, err := p.Arrow(ctx) + if err != nil { + return nil, err + } + defer ar.Close() + + execer, ok := ar.drv.(driver.ExecerContext) + if !ok { + return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") + } + if arrowViewNeedsSpatial(reader) { + if _, err := execer.ExecContext(ctx, "LOAD spatial; CALL register_geoarrow_extensions()", nil); err != nil { + return nil, fmt.Errorf("prepare spatial arrow view: %w", err) + } + } + release, err := ar.RegisterView(reader, TempArrowViewName) + if err != nil { + return nil, fmt.Errorf("register arrow view: %w", err) + } + defer release() + + return execer.ExecContext(ctx, query, nil) +} + +func arrowViewNeedsSpatial(reader array.RecordReader) bool { + if reader == nil || reader.Schema() == nil { + return false + } + for _, f := range reader.Schema().Fields() { + if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isGeometryArrowExtension(ext) { + return true + } + if ext, ok := f.Metadata.GetValue("extension:name"); ok && isGeometryArrowExtension(ext) { + return true + } + } + return false +} + +func isGeometryArrowExtension(ext string) bool { + ext = strings.ToLower(ext) + return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" +} + func (p *Pool) RegisterScalarFunction(ctx context.Context, function ScalarFunction) error { return RegisterScalarFunction(ctx, p, function) } @@ -279,18 +335,6 @@ type Arrow struct { release func() } -// Exec runs a statement on the same DuckDB driver connection that backs the -// embedded *duckdb.Arrow. This lets callers RegisterView an Arrow stream and -// then INSERT/UPDATE against the registered view on the same connection — the -// view is per-connection and is not visible to other pool connections. -func (a *Arrow) Exec(ctx context.Context, query string) (driver.Result, error) { - execer, ok := a.drv.(driver.ExecerContext) - if !ok { - return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") - } - return execer.ExecContext(ctx, query, nil) -} - func (a *Arrow) Close() error { defer a.release() return a.drv.Close() diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index ff4a78e1..77cd789f 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -2,15 +2,11 @@ package planner import ( "context" - "database/sql" - "errors" "fmt" "strings" - "sync" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" - "github.com/google/uuid" "github.com/hugr-lab/query-engine/pkg/auth" "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" @@ -21,8 +17,6 @@ import ( "github.com/vektah/gqlparser/v2/ast" ) -const ingestViewNamePrefix = "_hugr_ingest_" - type ingestColumn struct { ArrowField arrow.Field Field *sdl.Field @@ -30,57 +24,42 @@ type ingestColumn struct { InputDef *ast.FieldDefinition } -type ingestExecState struct { - reader array.RecordReader - view string - arrow *db.Arrow - release func() - once sync.Once - err error -} - -func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, func() error, error) { +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, error) { if dataObject == "" { - return nil, nil, fmt.Errorf("missing data object") + return nil, fmt.Errorf("missing data object") } if reader == nil { - return nil, nil, fmt.Errorf("missing arrow reader") + return nil, fmt.Errorf("missing arrow reader") } info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) if err != nil { - return nil, nil, err + return nil, err } engine, err := planner.Engine(info.Catalog) if err != nil { - return nil, nil, err + return nil, err } if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { - return nil, nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { - return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) } columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema()) if err != nil { - return nil, nil, err + return nil, err } if len(columns) == 0 { - return nil, nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") + return nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") } permissionData, err := checkIngestPermissions(ctx, provider, info, mutationField, columns) if err != nil { - return nil, nil, err - } - - state := &ingestExecState{ - reader: reader, - view: ingestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""), + return nil, err } - node := ingestNode(ctx, info, mutation, engine, columns, permissionData, state) - return node, state.cancel, nil + return ingestNode(ctx, info, mutation, engine, columns, permissionData), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -245,43 +224,9 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info return permissionData, nil } -func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any, state *ingestExecState) *QueryPlanNode { - needsSpatial := ingestNeedsSpatial(columns) +func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, - Before: func(ctx context.Context, pool *db.Pool, node *QueryPlanNode) error { - ar, err := pool.Arrow(ctx) - if err != nil { - return fmt.Errorf("acquire duckdb arrow conn: %w", err) - } - if needsSpatial { - if _, err := ar.Exec(ctx, "LOAD spatial; CALL register_geoarrow_extensions()"); err != nil { - _ = ar.Close() - return fmt.Errorf("prepare spatial arrow ingest: %w", err) - } - } - release, err := ar.RegisterView(state.reader, state.view) - if err != nil { - _ = ar.Close() - return fmt.Errorf("register arrow view: %w", err) - } - state.arrow = ar - state.release = release - node.plan.exec = func(ctx context.Context, query string, args ...any) (sql.Result, error) { - if len(args) != 0 { - return nil, fmt.Errorf("ingest execution does not support SQL parameters") - } - res, err := ar.Exec(ctx, query) - if err != nil { - return nil, err - } - return res, nil - } - return nil - }, - After: func(ctx context.Context, pool *db.Pool, node *QueryPlanNode) error { - return state.cancel() - }, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { fieldValues := make(map[string]string, len(columns)) for _, c := range columns { @@ -348,32 +293,8 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e target, strings.Join(targetFields, ", "), strings.Join(selectExprs, ", "), - engines.Ident(state.view), + engines.Ident(db.TempArrowViewName), ), params, nil }, } } - -func ingestNeedsSpatial(columns []ingestColumn) bool { - for _, c := range columns { - if c.FieldDef != nil && c.FieldDef.Type.Name() == base.GeometryTypeName { - return true - } - } - return false -} - -func (s *ingestExecState) cancel() error { - s.once.Do(func() { - if s.release != nil { - s.release() - } - if s.arrow != nil { - s.err = s.arrow.Close() - } - }) - if errors.Is(s.err, sql.ErrConnDone) { - return nil - } - return s.err -} diff --git a/pkg/planner/plan.go b/pkg/planner/plan.go index fc5c182c..a0a0fc5a 100644 --- a/pkg/planner/plan.go +++ b/pkg/planner/plan.go @@ -2,7 +2,6 @@ package planner import ( "context" - "database/sql" "errors" "github.com/hugr-lab/query-engine/pkg/catalog" @@ -20,8 +19,6 @@ type QueryPlan struct { CompiledQuery string Params []any - - exec QueryPlanExecFunc } func (p *QueryPlan) Compile() error { @@ -45,15 +42,6 @@ func (p *QueryPlan) Execute(ctx context.Context, db *db.Pool) (data interface{}, return nil, err } } - if p.RootNode.After != nil { - defer func() { - afterErr := p.RootNode.After(ctx, db, p.RootNode) - if err == nil { - err = afterErr - } - }() - } - switch { case sdl.IsScalarType(p.Query.Definition.Type.Name()) && p.Query.Definition.Type.NamedType == "": @@ -93,9 +81,6 @@ func (p *QueryPlan) ExecuteStream(ctx context.Context, db *db.Pool) (types.Arrow tbl, done, err := db.QueryTableStream(ctx, p.CompiledQuery, p.Params...) if err != nil { - if p.RootNode.After != nil { - _ = p.RootNode.After(ctx, db, p.RootNode) - } return nil, nil, err } if tbl != nil { @@ -103,43 +88,9 @@ func (p *QueryPlan) ExecuteStream(ctx context.Context, db *db.Pool) (types.Arrow tbl.SetGeometryInfo(gi) } } - if p.RootNode.After != nil { - originalDone := done - done = func() { - if originalDone != nil { - originalDone() - } - _ = p.RootNode.After(ctx, db, p.RootNode) - } - } return tbl, done, nil } -func (p *QueryPlan) ExecuteExec(ctx context.Context, db *db.Pool) (res sql.Result, err error) { - if p.CompiledQuery == "" { - return nil, errors.New("no compiled query") - } - if p.RootNode.Before != nil { - err = p.RootNode.Before(ctx, db, p.RootNode) - if err != nil { - return nil, err - } - } - if p.RootNode.After != nil { - defer func() { - afterErr := p.RootNode.After(ctx, db, p.RootNode) - if err == nil { - err = afterErr - } - }() - } - exec := p.exec - if exec == nil { - exec = db.Exec - } - return exec(ctx, p.CompiledQuery, p.Params...) -} - func (p *QueryPlan) Log() string { return p.CompiledQuery } @@ -157,7 +108,6 @@ type QueryPlanNode struct { Parent *QueryPlanNode Before NodeBeforeExecFunc - After NodeAfterExecFunc provider catalog.Provider engines Catalog @@ -167,8 +117,6 @@ type QueryPlanNode struct { } type NodeBeforeExecFunc func(ctx context.Context, db *db.Pool, node *QueryPlanNode) error -type NodeAfterExecFunc func(ctx context.Context, db *db.Pool, node *QueryPlanNode) error -type QueryPlanExecFunc func(ctx context.Context, query string, args ...any) (sql.Result, error) type QueryPlanNodes []*QueryPlanNode diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index 804ceb58..be9353c1 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -70,17 +70,14 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as // PlanArrowIngest builds an INSERT-from-Arrow-view plan for the target data object. // The Arrow reader is part of this planning API because its schema drives column // resolution and ingest casting, while execution registers it as a temporary view. -func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, func() error, error) { - node, cancel, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) +func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, error) { + node, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) if err != nil { - if cancel != nil { - _ = cancel() - } - return nil, nil, err + return nil, err } node.provider = provider node.engines = s.engines node.querier = s.querier - return &QueryPlan{RootNode: node}, cancel, nil + return &QueryPlan{RootNode: node}, nil } From 6763dd233079fd196a5b1d37dd6c4ef89dd62e46 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 7 Jun 2026 15:47:03 +0400 Subject: [PATCH 14/36] ipc ingest --- ipc-ingest.go | 2 +- pkg/db/pool.go | 4 +-- pkg/engines/duckdb.go | 9 ++++-- pkg/engines/engines.go | 4 +++ pkg/engines/postgres.go | 51 ++++++++++++++++++++++++++++---- pkg/planner/node_arrow_ingest.go | 49 ++++++++++++++++++++++-------- pkg/planner/plan.go | 2 ++ pkg/planner/planer.go | 4 +-- 8 files changed, 101 insertions(+), 24 deletions(-) diff --git a/ipc-ingest.go b/ipc-ingest.go index 7432d44b..4f04774d 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -93,7 +93,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { writeIngestError(w, http.StatusInternalServerError, "arrow ingest plan produced SQL parameters") return } - res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery) + res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery, plan.RequiresSpatial) if err != nil { writeIngestError(w, http.StatusInternalServerError, err.Error()) return diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 1ae7376d..fc85e4ad 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -229,7 +229,7 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { // ExecWithArrowView registers reader as TempArrowViewName and executes query on // the same DuckDB driver connection, where the temporary Arrow view is visible. -func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string) (sql.Result, error) { +func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string, requiresSpatial bool) (sql.Result, error) { if reader == nil { return nil, fmt.Errorf("missing arrow reader") } @@ -243,7 +243,7 @@ func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, if !ok { return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") } - if arrowViewNeedsSpatial(reader) { + if requiresSpatial || arrowViewNeedsSpatial(reader) { if _, err := execer.ExecContext(ctx, "LOAD spatial; CALL register_geoarrow_extensions()", nil); err != nil { return nil, fmt.Errorf("prepare spatial arrow view: %w", err) } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 3f5340d6..7f515b8b 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -46,8 +46,9 @@ var scalarJSONInfo = map[string]jsonTypeInfo{ } var ( - _ Engine = &DuckDB{} - _ EngineAggregator = &DuckDB{} + _ Engine = &DuckDB{} + _ EngineArrowIngestCaster = &DuckDB{} + _ EngineAggregator = &DuckDB{} ) type DuckDB struct { @@ -85,6 +86,10 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { } } +func (e *DuckDB) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { + return CastArrowIngestValueToDuckDB(field, arrowField, sql) +} + func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { if field == nil || field.Definition == nil { return sql, nil diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 8041b8dd..60a56ca8 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -71,6 +71,10 @@ type EngineTypeCaster interface { Engine ToIntermediateType(*ast.Field) (string, error) CastFromIntermediateType(field *ast.Field, toJSON bool) (string, error) +} + +type EngineArrowIngestCaster interface { + Engine CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) } diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 5a06e63e..4d7c0f64 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -21,10 +21,11 @@ import ( ) var ( - _ Engine = &Postgres{} - _ EngineQueryScanner = &Postgres{} - _ EngineTypeCaster = &Postgres{} - _ EngineAggregator = &Postgres{} + _ Engine = &Postgres{} + _ EngineQueryScanner = &Postgres{} + _ EngineTypeCaster = &Postgres{} + _ EngineArrowIngestCaster = &Postgres{} + _ EngineAggregator = &Postgres{} ) type Postgres struct { @@ -595,7 +596,47 @@ func (e *Postgres) CastFromIntermediateType(f *ast.Field, toJSON bool) (string, } func (e *Postgres) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { - return CastArrowIngestValueToDuckDB(field, arrowField, sql) + return CastArrowIngestValueToPostgres(field, arrowField, sql) +} + +func CastArrowIngestValueToPostgres(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { + if field == nil || field.Definition == nil { + return sql, nil + } + switch field.Definition.Type.Name() { + case base.JSONTypeName: + return CastArrowIngestValueToDuckDB(field, arrowField, sql) + case base.GeometryTypeName: + return castArrowGeometryToPostgres(arrowField, sql) + default: + return sql, nil + } +} + +func castArrowGeometryToPostgres(field arrow.Field, sql string) (string, error) { + switch arrowExtensionName(field) { + case "geoarrow.wkb": + return "ST_AsText(ST_GeomFromWKB(" + sql + "))", nil + case "geoarrow.wkt": + return sql, nil + case "hugr.geojson", "geoarrow.geojson", "geojson": + return "ST_AsText(ST_GeomFromGeoJSON(" + sql + "))", nil + case "geoarrow.point", "geoarrow.linestring", "geoarrow.polygon", + "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", + "geoarrow.geometry", "geoarrow.geometrycollection": + return "ST_AsText(" + sql + ")", nil + } + + switch field.Type.ID() { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: + return "ST_AsText(ST_GeomFromWKB(" + sql + "))", nil + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN ST_AsText(ST_GeomFromGeoJSON(" + sql + ")) ELSE " + sql + " END", nil + case arrow.STRUCT, arrow.MAP: + return "ST_AsText(ST_GeomFromGeoJSON(to_json(" + sql + ")::VARCHAR))", nil + default: + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Postgres Geometry without geoarrow/hugr metadata", field.Name, field.Type) + } } func pgRangeValueToSQLValue(v any) (string, error) { diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 77cd789f..06b6d61b 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -24,42 +24,42 @@ type ingestColumn struct { InputDef *ast.FieldDefinition } -func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, error) { +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, bool, error) { if dataObject == "" { - return nil, fmt.Errorf("missing data object") + return nil, false, fmt.Errorf("missing data object") } if reader == nil { - return nil, fmt.Errorf("missing arrow reader") + return nil, false, fmt.Errorf("missing arrow reader") } info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) if err != nil { - return nil, err + return nil, false, err } engine, err := planner.Engine(info.Catalog) if err != nil { - return nil, err + return nil, false, err } if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { - return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + return nil, false, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { - return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + return nil, false, fmt.Errorf("data object %q has no insert mutation defined", dataObject) } columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema()) if err != nil { - return nil, err + return nil, false, err } if len(columns) == 0 { - return nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") + return nil, false, fmt.Errorf("no insertable columns matched between arrow stream and data object") } permissionData, err := checkIngestPermissions(ctx, provider, info, mutationField, columns) if err != nil { - return nil, err + return nil, false, err } - return ingestNode(ctx, info, mutation, engine, columns, permissionData), nil + return ingestNode(ctx, info, mutation, engine, columns, permissionData), ingestRequiresSpatial(columns, reader.Schema()), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -238,7 +238,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e ObjectDefinition: info.Definition(), } var err error - if caster, ok := engine.(engines.EngineTypeCaster); ok { + if caster, ok := engine.(engines.EngineArrowIngestCaster); ok { value, err = caster.CastArrowIngestValue(field, c.ArrowField, value) } else { value, err = engines.CastArrowIngestValueToDuckDB(field, c.ArrowField, value) @@ -298,3 +298,28 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e }, } } + +func ingestRequiresSpatial(columns []ingestColumn, schema *arrow.Schema) bool { + for _, c := range columns { + if c.FieldDef != nil && c.FieldDef.Type.Name() == base.GeometryTypeName { + return true + } + } + if schema == nil { + return false + } + for _, f := range schema.Fields() { + if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isIngestGeometryExtension(ext) { + return true + } + if ext, ok := f.Metadata.GetValue("extension:name"); ok && isIngestGeometryExtension(ext) { + return true + } + } + return false +} + +func isIngestGeometryExtension(ext string) bool { + ext = strings.ToLower(ext) + return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" +} diff --git a/pkg/planner/plan.go b/pkg/planner/plan.go index a0a0fc5a..0a5e60cd 100644 --- a/pkg/planner/plan.go +++ b/pkg/planner/plan.go @@ -19,6 +19,8 @@ type QueryPlan struct { CompiledQuery string Params []any + + RequiresSpatial bool } func (p *QueryPlan) Compile() error { diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index be9353c1..14e23b73 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -71,7 +71,7 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as // The Arrow reader is part of this planning API because its schema drives column // resolution and ingest casting, while execution registers it as a temporary view. func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, error) { - node, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) + node, requiresSpatial, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) if err != nil { return nil, err } @@ -79,5 +79,5 @@ func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider node.engines = s.engines node.querier = s.querier - return &QueryPlan{RootNode: node}, nil + return &QueryPlan{RootNode: node, RequiresSpatial: requiresSpatial}, nil } From 68b38e626575a18166de29b48590d34eeae726f6 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 7 Jun 2026 16:09:52 +0400 Subject: [PATCH 15/36] ipc ingest --- ipc-ingest.go | 2 +- pkg/db/pool.go | 4 +-- pkg/planner/node_arrow_ingest.go | 47 ++++++++------------------------ pkg/planner/plan.go | 2 -- pkg/planner/planer.go | 4 +-- 5 files changed, 16 insertions(+), 43 deletions(-) diff --git a/ipc-ingest.go b/ipc-ingest.go index 4f04774d..7432d44b 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -93,7 +93,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { writeIngestError(w, http.StatusInternalServerError, "arrow ingest plan produced SQL parameters") return } - res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery, plan.RequiresSpatial) + res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery) if err != nil { writeIngestError(w, http.StatusInternalServerError, err.Error()) return diff --git a/pkg/db/pool.go b/pkg/db/pool.go index fc85e4ad..1ae7376d 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -229,7 +229,7 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { // ExecWithArrowView registers reader as TempArrowViewName and executes query on // the same DuckDB driver connection, where the temporary Arrow view is visible. -func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string, requiresSpatial bool) (sql.Result, error) { +func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string) (sql.Result, error) { if reader == nil { return nil, fmt.Errorf("missing arrow reader") } @@ -243,7 +243,7 @@ func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, if !ok { return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") } - if requiresSpatial || arrowViewNeedsSpatial(reader) { + if arrowViewNeedsSpatial(reader) { if _, err := execer.ExecContext(ctx, "LOAD spatial; CALL register_geoarrow_extensions()", nil); err != nil { return nil, fmt.Errorf("prepare spatial arrow view: %w", err) } diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 06b6d61b..de27feb1 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -24,42 +24,42 @@ type ingestColumn struct { InputDef *ast.FieldDefinition } -func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, bool, error) { +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, error) { if dataObject == "" { - return nil, false, fmt.Errorf("missing data object") + return nil, fmt.Errorf("missing data object") } if reader == nil { - return nil, false, fmt.Errorf("missing arrow reader") + return nil, fmt.Errorf("missing arrow reader") } info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) if err != nil { - return nil, false, err + return nil, err } engine, err := planner.Engine(info.Catalog) if err != nil { - return nil, false, err + return nil, err } if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { - return nil, false, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { - return nil, false, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) } columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema()) if err != nil { - return nil, false, err + return nil, err } if len(columns) == 0 { - return nil, false, fmt.Errorf("no insertable columns matched between arrow stream and data object") + return nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") } permissionData, err := checkIngestPermissions(ctx, provider, info, mutationField, columns) if err != nil { - return nil, false, err + return nil, err } - return ingestNode(ctx, info, mutation, engine, columns, permissionData), ingestRequiresSpatial(columns, reader.Schema()), nil + return ingestNode(ctx, info, mutation, engine, columns, permissionData), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -298,28 +298,3 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e }, } } - -func ingestRequiresSpatial(columns []ingestColumn, schema *arrow.Schema) bool { - for _, c := range columns { - if c.FieldDef != nil && c.FieldDef.Type.Name() == base.GeometryTypeName { - return true - } - } - if schema == nil { - return false - } - for _, f := range schema.Fields() { - if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isIngestGeometryExtension(ext) { - return true - } - if ext, ok := f.Metadata.GetValue("extension:name"); ok && isIngestGeometryExtension(ext) { - return true - } - } - return false -} - -func isIngestGeometryExtension(ext string) bool { - ext = strings.ToLower(ext) - return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" -} diff --git a/pkg/planner/plan.go b/pkg/planner/plan.go index 0a5e60cd..a0a0fc5a 100644 --- a/pkg/planner/plan.go +++ b/pkg/planner/plan.go @@ -19,8 +19,6 @@ type QueryPlan struct { CompiledQuery string Params []any - - RequiresSpatial bool } func (p *QueryPlan) Compile() error { diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index 14e23b73..be9353c1 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -71,7 +71,7 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as // The Arrow reader is part of this planning API because its schema drives column // resolution and ingest casting, while execution registers it as a temporary view. func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, error) { - node, requiresSpatial, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) + node, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) if err != nil { return nil, err } @@ -79,5 +79,5 @@ func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider node.engines = s.engines node.querier = s.querier - return &QueryPlan{RootNode: node, RequiresSpatial: requiresSpatial}, nil + return &QueryPlan{RootNode: node}, nil } From a989c96806bb69c1d03ea4f118325e5747710170 Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 8 Jun 2026 11:40:22 +0400 Subject: [PATCH 16/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 89 +++++++++++++++++-- .../schemas/duck_ingest/schema.graphql | 1 + .../ingest-postgres/docker-compose.yml | 2 +- .../ingest-postgres/ingest_postgres_test.go | 87 +++++++++++++++++- .../ingest-postgres/testdata/init.sql | 5 +- .../testdata/schemas/pg_ingest/schema.graphql | 1 + pkg/db/pool.go | 2 +- pkg/engines/duckdb.go | 8 +- pkg/engines/postgres.go | 43 ++++++--- 9 files changed, 212 insertions(+), 26 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index d631f453..5903903a 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -115,6 +115,7 @@ func setupEnv(t *testing.T) *ingestEnv { seed, err := sql.Open("duckdb", dbPath) require.NoError(t, err) _, err = seed.ExecContext(ctx, ` + INSTALL spatial; LOAD spatial; CREATE SEQUENCE events_id_seq; CREATE TABLE events ( id BIGINT PRIMARY KEY DEFAULT nextval('events_id_seq'), @@ -122,7 +123,8 @@ func setupEnv(t *testing.T) *ingestEnv { value DOUBLE NOT NULL, is_active BOOLEAN NOT NULL DEFAULT true, payload JSON, - created_at TIMESTAMPTZ NOT NULL DEFAULT now() + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + geom GEOMETRY ); `) require.NoError(t, err) @@ -253,10 +255,10 @@ func TestIngest_DuckDB_RoundTrip(t *testing.T) { require.NoError(t, err) defer rows.Close() var ( - gotNames []string - gotValues []float64 - gotActive []bool - gotHasJSON []bool + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool ) for rows.Next() { var n string @@ -804,6 +806,51 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } +func TestIngest_HTTP_GeoArrowPoint_DuckDB(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeoArrowPointRecord(t, []string{"geoarrow-a", "geoarrow-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(2), out.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom"}, out.Columns) + + ro := env.openRO(t) + defer ro.Close() + _, err = ro.Exec("LOAD spatial") + require.NoError(t, err) + + rows, err := ro.Query("SELECT name, ST_AsText(geom) FROM events WHERE name LIKE 'geoarrow-%' ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + for rows.Next() { + var name, wkt string + require.NoError(t, rows.Scan(&name, &wkt)) + got[name] = wkt + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "geoarrow-a": "POINT (30.5 50.25)", + "geoarrow-b": "POINT (-73.935242 40.73061)", + }, got) +} + // --- helpers -------------------------------------------------------------- type arrowFileFormat int @@ -823,6 +870,38 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } +func makeGeoArrowPointRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { + t.Helper() + require.Len(t, points, len(names)) + + pointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, + }, nil) + + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + for i, point := range points { + b.Field(1).(*array.Float64Builder).Append(float64(i + 1)) + b.Field(2).(*array.BooleanBuilder).Append(true) + sb := b.Field(3).(*array.StructBuilder) + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + } + + return b.NewRecordBatch(), schema +} + // buildEventsBatch produces one RecordBatch of `rowsPerBatch` rows for the // events schema. Row payload pattern matches the PG bulk fixtures so the // spot-check assertions are reusable. diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index bdd25b85..fdc5c3ed 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -5,4 +5,5 @@ type events @table(name: "events") { is_active: Boolean! @default(value: "true") payload: JSON created_at: Timestamp @default(value: "now()") + geom: Geometry @geometry_info(srid: 4326, type: POINT) } diff --git a/integration-test/ingest-postgres/docker-compose.yml b/integration-test/ingest-postgres/docker-compose.yml index 83ac0a23..65f6b3ac 100644 --- a/integration-test/ingest-postgres/docker-compose.yml +++ b/integration-test/ingest-postgres/docker-compose.yml @@ -1,6 +1,6 @@ services: postgres: - image: pgvector/pgvector:pg16 + image: postgis/postgis:16-3.4 environment: POSTGRES_DB: ingestdb POSTGRES_USER: test diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 8be37754..ff5b5f4b 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -238,10 +238,10 @@ func TestIngest_Postgres_RoundTrip(t *testing.T) { require.NoError(t, err) defer rows.Close() var ( - gotNames []string - gotValues []float64 - gotActive []bool - gotHasJSON []bool + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool ) for rows.Next() { var n string @@ -452,6 +452,38 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } +func makeGeoArrowPointRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { + t.Helper() + require.Len(t, points, len(names)) + + pointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, + }, nil) + + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + for i, point := range points { + b.Field(1).(*array.Float64Builder).Append(float64(i + 1)) + b.Field(2).(*array.BooleanBuilder).Append(true) + sb := b.Field(3).(*array.StructBuilder) + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + } + + return b.NewRecordBatch(), schema +} + // writeEventsArrowFile produces an Arrow IPC file at path in the given // format with numBatches × rowsPerBatch synthetic events rows. namePrefix is // embedded in the `name` column so different tests can write to the same @@ -1017,6 +1049,53 @@ func TestIngest_HTTP_Direct(t *testing.T) { totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } +func TestIngest_HTTP_GeoArrowPoint(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeoArrowPointRecord(t, []string{"geoarrow-a", "geoarrow-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(2), out.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom"}, out.Columns) + + rows, err := env.pgConn.Query("SELECT name, ST_AsText(geom), ST_SRID(geom) FROM events WHERE name LIKE 'geoarrow-%' ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + gotSRID := map[string]int{} + for rows.Next() { + var name, wkt string + var srid int + require.NoError(t, rows.Scan(&name, &wkt, &srid)) + got[name] = wkt + gotSRID[name] = srid + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "geoarrow-a": "POINT(30.5 50.25)", + "geoarrow-b": "POINT(-73.935242 40.73061)", + }, got) + assert.Equal(t, map[string]int{ + "geoarrow-a": 4326, + "geoarrow-b": 4326, + }, gotSRID) +} + // lazyEventsReader is an array.RecordReader that generates events-table // RecordBatches on demand. This is the shape of a real-world Arrow producer // (parquet scanner, CDC tap, kafka batcher) — the whole stream is never diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 07362863..3208680b 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -3,11 +3,14 @@ -- so the tests can also exercise "default value" behaviour (omitting the PK -- from the Arrow stream). +CREATE EXTENSION IF NOT EXISTS postgis; + CREATE TABLE events ( id BIGSERIAL PRIMARY KEY, name VARCHAR NOT NULL, value DOUBLE PRECISION NOT NULL, is_active BOOLEAN NOT NULL DEFAULT true, payload JSONB, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + geom GEOMETRY(Point, 4326) ); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index bdd25b85..fdc5c3ed 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -5,4 +5,5 @@ type events @table(name: "events") { is_active: Boolean! @default(value: "true") payload: JSON created_at: Timestamp @default(value: "now()") + geom: Geometry @geometry_info(srid: 4326, type: POINT) } diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 1ae7376d..42191d62 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -244,7 +244,7 @@ func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") } if arrowViewNeedsSpatial(reader) { - if _, err := execer.ExecContext(ctx, "LOAD spatial; CALL register_geoarrow_extensions()", nil); err != nil { + if _, err := execer.ExecContext(ctx, "LOAD spatial", nil); err != nil { return nil, fmt.Errorf("prepare spatial arrow view: %w", err) } } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 7f515b8b..c153c6f7 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -131,7 +131,9 @@ func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { return "ST_GeomFromText(" + sql + ", true)", nil case "hugr.geojson", "geoarrow.geojson", "geojson": return "ST_GeomFromGeoJSON(" + sql + ")", nil - case "geoarrow.point", "geoarrow.linestring", "geoarrow.polygon", + case "geoarrow.point": + return duckDBGeoArrowPoint(sql), nil + case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", "geoarrow.geometry", "geoarrow.geometrycollection": return sql, nil @@ -149,6 +151,10 @@ func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { } } +func duckDBGeoArrowPoint(sql string) string { + return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +} + func (e *DuckDB) FieldValueByPath(sqlName, path string) string { if path == "" { return sqlName diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 4d7c0f64..ffd6dec8 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -607,36 +607,53 @@ func CastArrowIngestValueToPostgres(field *ast.Field, arrowField arrow.Field, sq case base.JSONTypeName: return CastArrowIngestValueToDuckDB(field, arrowField, sql) case base.GeometryTypeName: - return castArrowGeometryToPostgres(arrowField, sql) + return castArrowGeometryToPostgres(field, arrowField, sql) default: return sql, nil } } -func castArrowGeometryToPostgres(field arrow.Field, sql string) (string, error) { - switch arrowExtensionName(field) { +func castArrowGeometryToPostgres(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { + srid := "" + if field != nil && field.Definition != nil { + srid = base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) + } + switch arrowExtensionName(arrowField) { case "geoarrow.wkb": - return "ST_AsText(ST_GeomFromWKB(" + sql + "))", nil + return postgresGeometryText("ST_GeomFromWKB("+sql+")", srid), nil case "geoarrow.wkt": - return sql, nil + return postgresWKTText(sql, srid), nil case "hugr.geojson", "geoarrow.geojson", "geojson": - return "ST_AsText(ST_GeomFromGeoJSON(" + sql + "))", nil - case "geoarrow.point", "geoarrow.linestring", "geoarrow.polygon", + return postgresGeometryText("ST_GeomFromGeoJSON("+sql+")", srid), nil + case "geoarrow.point": + return postgresGeometryText(duckDBGeoArrowPoint(sql), srid), nil + case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", "geoarrow.geometry", "geoarrow.geometrycollection": - return "ST_AsText(" + sql + ")", nil + return postgresGeometryText(sql, srid), nil } - switch field.Type.ID() { + switch arrowField.Type.ID() { case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: - return "ST_AsText(ST_GeomFromWKB(" + sql + "))", nil + return postgresGeometryText("ST_GeomFromWKB("+sql+")", srid), nil case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN ST_AsText(ST_GeomFromGeoJSON(" + sql + ")) ELSE " + sql + " END", nil + return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN " + postgresGeometryText("ST_GeomFromGeoJSON("+sql+")", srid) + " ELSE " + postgresWKTText(sql, srid) + " END", nil case arrow.STRUCT, arrow.MAP: - return "ST_AsText(ST_GeomFromGeoJSON(to_json(" + sql + ")::VARCHAR))", nil + return postgresGeometryText("ST_GeomFromGeoJSON(to_json("+sql+")::VARCHAR)", srid), nil default: - return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Postgres Geometry without geoarrow/hugr metadata", field.Name, field.Type) + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Postgres Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) + } +} + +func postgresGeometryText(sql, srid string) string { + return postgresWKTText("ST_AsText("+sql+")", srid) +} + +func postgresWKTText(sql, srid string) string { + if srid == "" || srid == "0" { + return sql } + return "'SRID=" + srid + ";' || " + sql } func pgRangeValueToSQLValue(v any) (string, error) { From e0903482b2fddab4e83af834b9ee43785ce75a9c Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 8 Jun 2026 23:52:35 +0400 Subject: [PATCH 17/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 181 +++++++++++++--- .../schemas/duck_ingest/schema.graphql | 3 + .../ingest-postgres/ingest_postgres_test.go | 194 +++++++++++++++--- .../ingest-postgres/testdata/init.sql | 5 +- .../testdata/schemas/pg_ingest/schema.graphql | 3 + pkg/db/pool.go | 4 + pkg/engines/arrow_ingest.go | 29 +++ pkg/engines/duckdb.go | 17 +- pkg/engines/postgres.go | 3 + 9 files changed, 370 insertions(+), 69 deletions(-) create mode 100644 pkg/engines/arrow_ingest.go diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index 5903903a..4ad7409a 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -15,6 +15,7 @@ import ( "net/http/httptest" "os" "path/filepath" + "strconv" "sync/atomic" "testing" "time" @@ -24,6 +25,8 @@ import ( "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" _ "github.com/duckdb/duckdb-go/v2" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -124,7 +127,10 @@ func setupEnv(t *testing.T) *ingestEnv { is_active BOOLEAN NOT NULL DEFAULT true, payload JSON, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), - geom GEOMETRY + geom GEOMETRY, + geom_wkt GEOMETRY, + geom_geojson GEOMETRY, + geom_wkb GEOMETRY ); `) require.NoError(t, err) @@ -806,10 +812,10 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } -func TestIngest_HTTP_GeoArrowPoint_DuckDB(t *testing.T) { +func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeoArrowPointRecord(t, []string{"geoarrow-a", "geoarrow-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []string{"geo-a", "geo-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) defer rec.Release() var buf bytes.Buffer @@ -827,30 +833,108 @@ func TestIngest_HTTP_GeoArrowPoint_DuckDB(t *testing.T) { var out hugrclient.IngestResult require.NoError(t, json.Unmarshal(body, &out)) assert.Equal(t, int64(2), out.Inserted) - assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom"}, out.Columns) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom", "geom_wkt", "geom_geojson", "geom_wkb"}, out.Columns) ro := env.openRO(t) defer ro.Close() _, err = ro.Exec("LOAD spatial") require.NoError(t, err) - rows, err := ro.Query("SELECT name, ST_AsText(geom) FROM events WHERE name LIKE 'geoarrow-%' ORDER BY name") + rows, err := ro.Query(` + SELECT name, ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb) + FROM events + WHERE name LIKE 'geo-%' + ORDER BY name + `) require.NoError(t, err) defer rows.Close() - got := map[string]string{} + got := map[string][]string{} for rows.Next() { - var name, wkt string - require.NoError(t, rows.Scan(&name, &wkt)) - got[name] = wkt + var name, point, line, polygon, wkbPoint string + require.NoError(t, rows.Scan(&name, &point, &line, &polygon, &wkbPoint)) + got[name] = []string{point, line, polygon, wkbPoint} } require.NoError(t, rows.Err()) - assert.Equal(t, map[string]string{ - "geoarrow-a": "POINT (30.5 50.25)", - "geoarrow-b": "POINT (-73.935242 40.73061)", + assert.Equal(t, map[string][]string{ + "geo-a": []string{"POINT (30.5 50.25)", "LINESTRING (0 0, 1 1, 2 1)", "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", "POINT (30.5 50.25)"}, + "geo-b": []string{"POINT (-73.935242 40.73061)", "LINESTRING (1 1, 2 2, 3 2)", "POLYGON ((1 1, 1 2, 2 2, 2 1, 1 1))", "POINT (-73.935242 40.73061)"}, }, got) } +func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-geo-bulk" + ) + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(schema)) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildGeometryTypesBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix) + if err := w.Write(rec); err != nil { + streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err) + rec.Release() + break + } + rec.Release() + } + if err := w.Close(); err != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", err) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + resp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, werr, "writer goroutine failed") + require.NoError(t, postErr) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(totalRows), out.Inserted) + + ro := env.openRO(t) + defer ro.Close() + _, err := ro.Exec("LOAD spatial") + require.NoError(t, err) + + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-geo-bulk-%'").Scan(&count)) + assert.Equal(t, totalRows, count) + + var point, line, polygon, wkbPoint string + require.NoError(t, ro.QueryRow(` + SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb) + FROM events + WHERE name = 'dk-geo-bulk-049999' + `).Scan(&point, &line, &polygon, &wkbPoint)) + assert.Equal(t, "POINT (99 49)", point) + assert.Equal(t, "LINESTRING (99 49, 100 50, 101 50)", line) + assert.Equal(t, "POLYGON ((99 49, 99 50, 100 50, 100 49, 99 49))", polygon) + assert.Equal(t, "POINT (99 49)", wkbPoint) + + elapsed := time.Since(start) + t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + // --- helpers -------------------------------------------------------------- type arrowFileFormat int @@ -870,36 +954,85 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } -func makeGeoArrowPointRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { +func makeGeometryTypesRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { t.Helper() require.Len(t, points, len(names)) + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for i, name := range names { + appendGeometryTypesRow(b, name, float64(i+1), true, points[i], float64(i), float64(i)) + } + + return b.NewRecordBatch(), schema +} + +func geometryTypesSchema() *arrow.Schema { pointType := arrow.StructOf( arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) - schema := arrow.NewSchema([]arrow.Field{ + return arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, + {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, + {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, + {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, }, nil) +} - pool := memory.NewGoAllocator() +func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) - for i, point := range points { - b.Field(1).(*array.Float64Builder).Append(float64(i + 1)) - b.Field(2).(*array.BooleanBuilder).Append(true) - sb := b.Field(3).(*array.StructBuilder) - sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + x := float64(row % 100) + y := float64(row / 1000) + appendGeometryTypesRow(b, fmt.Sprintf("%s-%06d", namePrefix, row), float64(row)*0.5, row%2 == 0, [2]float64{x, y}, x, y) } + return b.NewRecordBatch() +} - return b.NewRecordBatch(), schema +func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, active bool, point [2]float64, shapeX, shapeY float64) { + b.Field(0).(*array.StringBuilder).Append(name) + b.Field(1).(*array.Float64Builder).Append(value) + b.Field(2).(*array.BooleanBuilder).Append(active) + + sb := b.Field(3).(*array.StructBuilder) + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + + b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) + b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) + b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) +} + +func lineWKT(x, y float64) string { + return fmt.Sprintf("LINESTRING (%s %s, %s %s, %s %s)", + coord(x), coord(y), + coord(x+1), coord(y+1), + coord(x+2), coord(y+1)) +} + +func polygonGeoJSON(x, y float64) string { + return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + coord(x), coord(y), + coord(x), coord(y+1), + coord(x+1), coord(y+1), + coord(x+1), coord(y), + coord(x), coord(y)) +} + +func coord(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) } // buildEventsBatch produces one RecordBatch of `rowsPerBatch` rows for the diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index fdc5c3ed..1656d3cb 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -6,4 +6,7 @@ type events @table(name: "events") { payload: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) + geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) } diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index ff5b5f4b..6ec5c1b8 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -15,6 +15,7 @@ import ( "net/http/httptest" "os" "path/filepath" + "strconv" "sync/atomic" "testing" "time" @@ -24,6 +25,8 @@ import ( "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" _ "github.com/jackc/pgx/v5/stdlib" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -452,36 +455,85 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } -func makeGeoArrowPointRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { +func makeGeometryTypesRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { t.Helper() require.Len(t, points, len(names)) + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for i, name := range names { + appendGeometryTypesRow(b, name, float64(i+1), true, points[i], float64(i), float64(i)) + } + + return b.NewRecordBatch(), schema +} + +func geometryTypesSchema() *arrow.Schema { pointType := arrow.StructOf( arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) - schema := arrow.NewSchema([]arrow.Field{ + return arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, + {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, + {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, + {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, }, nil) +} - pool := memory.NewGoAllocator() +func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) - for i, point := range points { - b.Field(1).(*array.Float64Builder).Append(float64(i + 1)) - b.Field(2).(*array.BooleanBuilder).Append(true) - sb := b.Field(3).(*array.StructBuilder) - sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + x := float64(row % 100) + y := float64(row / 1000) + appendGeometryTypesRow(b, fmt.Sprintf("%s-%06d", namePrefix, row), float64(row)*0.5, row%2 == 0, [2]float64{x, y}, x, y) } + return b.NewRecordBatch() +} - return b.NewRecordBatch(), schema +func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, active bool, point [2]float64, shapeX, shapeY float64) { + b.Field(0).(*array.StringBuilder).Append(name) + b.Field(1).(*array.Float64Builder).Append(value) + b.Field(2).(*array.BooleanBuilder).Append(active) + + sb := b.Field(3).(*array.StructBuilder) + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + + b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) + b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) + b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) +} + +func lineWKT(x, y float64) string { + return fmt.Sprintf("LINESTRING (%s %s, %s %s, %s %s)", + coord(x), coord(y), + coord(x+1), coord(y+1), + coord(x+2), coord(y+1)) +} + +func polygonGeoJSON(x, y float64) string { + return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + coord(x), coord(y), + coord(x), coord(y+1), + coord(x+1), coord(y+1), + coord(x+1), coord(y), + coord(x), coord(y)) +} + +func coord(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) } // writeEventsArrowFile produces an Arrow IPC file at path in the given @@ -1049,10 +1101,10 @@ func TestIngest_HTTP_Direct(t *testing.T) { totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } -func TestIngest_HTTP_GeoArrowPoint(t *testing.T) { +func TestIngest_HTTP_GeometryTypes(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeoArrowPointRecord(t, []string{"geoarrow-a", "geoarrow-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []string{"geo-a", "geo-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) defer rec.Release() var buf bytes.Buffer @@ -1070,32 +1122,114 @@ func TestIngest_HTTP_GeoArrowPoint(t *testing.T) { var out hugrclient.IngestResult require.NoError(t, json.Unmarshal(body, &out)) assert.Equal(t, int64(2), out.Inserted) - assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom"}, out.Columns) - - rows, err := env.pgConn.Query("SELECT name, ST_AsText(geom), ST_SRID(geom) FROM events WHERE name LIKE 'geoarrow-%' ORDER BY name") + assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom", "geom_wkt", "geom_geojson", "geom_wkb"}, out.Columns) + + rows, err := env.pgConn.Query(` + SELECT name, + ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_geojson), ST_SRID(geom_geojson), + ST_AsText(geom_wkb), ST_SRID(geom_wkb) + FROM events + WHERE name LIKE 'geo-%' + ORDER BY name + `) require.NoError(t, err) defer rows.Close() - got := map[string]string{} - gotSRID := map[string]int{} + got := map[string][]string{} + gotSRID := map[string][]int{} for rows.Next() { - var name, wkt string - var srid int - require.NoError(t, rows.Scan(&name, &wkt, &srid)) - got[name] = wkt - gotSRID[name] = srid + var name, point, line, polygon, wkbPoint string + var pointSRID, lineSRID, polygonSRID, wkbPointSRID int + require.NoError(t, rows.Scan(&name, &point, &pointSRID, &line, &lineSRID, &polygon, &polygonSRID, &wkbPoint, &wkbPointSRID)) + got[name] = []string{point, line, polygon, wkbPoint} + gotSRID[name] = []int{pointSRID, lineSRID, polygonSRID, wkbPointSRID} } require.NoError(t, rows.Err()) - assert.Equal(t, map[string]string{ - "geoarrow-a": "POINT(30.5 50.25)", - "geoarrow-b": "POINT(-73.935242 40.73061)", + assert.Equal(t, map[string][]string{ + "geo-a": []string{"POINT(30.5 50.25)", "LINESTRING(0 0,1 1,2 1)", "POLYGON((0 0,0 1,1 1,1 0,0 0))", "POINT(30.5 50.25)"}, + "geo-b": []string{"POINT(-73.935242 40.73061)", "LINESTRING(1 1,2 2,3 2)", "POLYGON((1 1,1 2,2 2,2 1,1 1))", "POINT(-73.935242 40.73061)"}, }, got) - assert.Equal(t, map[string]int{ - "geoarrow-a": 4326, - "geoarrow-b": 4326, + assert.Equal(t, map[string][]int{ + "geo-a": []int{4326, 4326, 4326, 4326}, + "geo-b": []int{4326, 4326, 4326, 4326}, }, gotSRID) } +func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "pg-geo-bulk" + ) + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(schema)) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildGeometryTypesBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix) + if err := w.Write(rec); err != nil { + streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err) + rec.Release() + break + } + rec.Release() + } + if err := w.Close(); err != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", err) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + resp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, werr, "writer goroutine failed") + require.NoError(t, postErr) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(totalRows), out.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count)) + assert.Equal(t, totalRows, count) + + var point, line, polygon, wkbPoint string + var pointSRID, lineSRID, polygonSRID, wkbPointSRID int + require.NoError(t, env.pgConn.QueryRow(` + SELECT ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_geojson), ST_SRID(geom_geojson), + ST_AsText(geom_wkb), ST_SRID(geom_wkb) + FROM events + WHERE name = 'pg-geo-bulk-049999' + `).Scan(&point, &pointSRID, &line, &lineSRID, &polygon, &polygonSRID, &wkbPoint, &wkbPointSRID)) + assert.Equal(t, "POINT(99 49)", point) + assert.Equal(t, "LINESTRING(99 49,100 50,101 50)", line) + assert.Equal(t, "POLYGON((99 49,99 50,100 50,100 49,99 49))", polygon) + assert.Equal(t, "POINT(99 49)", wkbPoint) + assert.Equal(t, []int{4326, 4326, 4326, 4326}, []int{pointSRID, lineSRID, polygonSRID, wkbPointSRID}) + + elapsed := time.Since(start) + t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + // lazyEventsReader is an array.RecordReader that generates events-table // RecordBatches on demand. This is the shape of a real-world Arrow producer // (parquet scanner, CDC tap, kafka batcher) — the whole stream is never diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 3208680b..c2733297 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -12,5 +12,8 @@ CREATE TABLE events ( is_active BOOLEAN NOT NULL DEFAULT true, payload JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - geom GEOMETRY(Point, 4326) + geom GEOMETRY(Point, 4326), + geom_wkt GEOMETRY(LineString, 4326), + geom_geojson GEOMETRY(Polygon, 4326), + geom_wkb GEOMETRY(Point, 4326) ); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index fdc5c3ed..1656d3cb 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -6,4 +6,7 @@ type events @table(name: "events") { payload: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) + geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) } diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 42191d62..cfdd2830 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -8,6 +8,7 @@ import ( "strings" "sync" + "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/duckdb/duckdb-go/v2" ) @@ -262,6 +263,9 @@ func arrowViewNeedsSpatial(reader array.RecordReader) bool { return false } for _, f := range reader.Schema().Fields() { + if extType, ok := f.Type.(arrow.ExtensionType); ok && isGeometryArrowExtension(extType.ExtensionName()) { + return true + } if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isGeometryArrowExtension(ext) { return true } diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go new file mode 100644 index 00000000..64fed466 --- /dev/null +++ b/pkg/engines/arrow_ingest.go @@ -0,0 +1,29 @@ +package engines + +import ( + "strings" + + "github.com/apache/arrow-go/v18/arrow" +) + +func arrowExtensionName(field arrow.Field) string { + if extType, ok := field.Type.(arrow.ExtensionType); ok { + return strings.ToLower(extType.ExtensionName()) + } + if ext, ok := field.Metadata.GetValue("ARROW:extension:name"); ok { + return strings.ToLower(ext) + } + if ext, ok := field.Metadata.GetValue("extension:name"); ok { + return strings.ToLower(ext) + } + return "" +} + +func arrowFieldIsExtensionType(field arrow.Field) bool { + _, ok := field.Type.(arrow.ExtensionType) + return ok +} + +func duckDBGeoArrowPoint(sql string) string { + return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +} diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index c153c6f7..cf5fca1e 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -113,19 +113,12 @@ func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql } } -func arrowExtensionName(field arrow.Field) string { - if ext, ok := field.Metadata.GetValue("ARROW:extension:name"); ok { - return strings.ToLower(ext) - } - if ext, ok := field.Metadata.GetValue("extension:name"); ok { - return strings.ToLower(ext) - } - return "" -} - func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { switch arrowExtensionName(field) { case "geoarrow.wkb": + if arrowFieldIsExtensionType(field) { + return sql, nil + } return "ST_GeomFromWKB(" + sql + ")", nil case "geoarrow.wkt": return "ST_GeomFromText(" + sql + ", true)", nil @@ -151,10 +144,6 @@ func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { } } -func duckDBGeoArrowPoint(sql string) string { - return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" -} - func (e *DuckDB) FieldValueByPath(sqlName, path string) string { if path == "" { return sqlName diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index ffd6dec8..00dc3edc 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -620,6 +620,9 @@ func castArrowGeometryToPostgres(field *ast.Field, arrowField arrow.Field, sql s } switch arrowExtensionName(arrowField) { case "geoarrow.wkb": + if arrowFieldIsExtensionType(arrowField) { + return postgresGeometryText(sql, srid), nil + } return postgresGeometryText("ST_GeomFromWKB("+sql+")", srid), nil case "geoarrow.wkt": return postgresWKTText(sql, srid), nil From 4fc8a4a633a2f249bcd85ac8b29db09fb562b43c Mon Sep 17 00:00:00 2001 From: vadim Date: Mon, 8 Jun 2026 23:57:23 +0400 Subject: [PATCH 18/36] ipc ingest --- pkg/planner/plan.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/planner/plan.go b/pkg/planner/plan.go index a0a0fc5a..d04c0979 100644 --- a/pkg/planner/plan.go +++ b/pkg/planner/plan.go @@ -4,11 +4,11 @@ import ( "context" "errors" - "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/engines" + "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/types" "github.com/vektah/gqlparser/v2/ast" ) @@ -42,6 +42,7 @@ func (p *QueryPlan) Execute(ctx context.Context, db *db.Pool) (data interface{}, return nil, err } } + switch { case sdl.IsScalarType(p.Query.Definition.Type.Name()) && p.Query.Definition.Type.NamedType == "": From 5c0b0d1d15f568fd0ae72c3eb5d65fd82f13a832 Mon Sep 17 00:00:00 2001 From: vadim Date: Tue, 9 Jun 2026 00:42:55 +0400 Subject: [PATCH 19/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 157 ++++++++++++++-- .../schemas/duck_ingest/schema.graphql | 5 + .../ingest-postgres/ingest_postgres_test.go | 171 ++++++++++++++++-- .../ingest-postgres/testdata/init.sql | 7 +- .../testdata/schemas/pg_ingest/schema.graphql | 5 + pkg/engines/airport.go | 1 + pkg/engines/arrow_ingest.go | 83 ++++++++- pkg/engines/arrow_ingest_test.go | 158 ++++++++++++++++ pkg/engines/duckdb.go | 19 +- pkg/engines/ducklake.go | 1 + pkg/engines/engines.go | 6 + pkg/engines/postgres.go | 35 +++- pkg/planner/node_arrow_ingest.go | 43 +++-- 13 files changed, 618 insertions(+), 73 deletions(-) create mode 100644 pkg/engines/arrow_ingest_test.go diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index 4ad7409a..a78a6045 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "strconv" + "strings" "sync/atomic" "testing" "time" @@ -130,7 +131,12 @@ func setupEnv(t *testing.T) *ingestEnv { geom GEOMETRY, geom_wkt GEOMETRY, geom_geojson GEOMETRY, - geom_wkb GEOMETRY + geom_wkb GEOMETRY, + geom_line GEOMETRY, + geom_polygon_native GEOMETRY, + geom_multipoint GEOMETRY, + geom_multiline GEOMETRY, + geom_multipolygon GEOMETRY ); `) require.NoError(t, err) @@ -833,7 +839,7 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { var out hugrclient.IngestResult require.NoError(t, json.Unmarshal(body, &out)) assert.Equal(t, int64(2), out.Inserted) - assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom", "geom_wkt", "geom_geojson", "geom_wkb"}, out.Columns) + assert.ElementsMatch(t, geometryTypesColumns(), out.Columns) ro := env.openRO(t) defer ro.Close() @@ -841,7 +847,10 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { require.NoError(t, err) rows, err := ro.Query(` - SELECT name, ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb) + SELECT name, + ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb), + ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), + ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) FROM events WHERE name LIKE 'geo-%' ORDER BY name @@ -851,14 +860,22 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { got := map[string][]string{} for rows.Next() { - var name, point, line, polygon, wkbPoint string - require.NoError(t, rows.Scan(&name, &point, &line, &polygon, &wkbPoint)) - got[name] = []string{point, line, polygon, wkbPoint} + var name string + values := make([]string, 9) + scanArgs := []any{&name} + for i := range values { + scanArgs = append(scanArgs, &values[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + got[name] = values } require.NoError(t, rows.Err()) assert.Equal(t, map[string][]string{ - "geo-a": []string{"POINT (30.5 50.25)", "LINESTRING (0 0, 1 1, 2 1)", "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", "POINT (30.5 50.25)"}, - "geo-b": []string{"POINT (-73.935242 40.73061)", "LINESTRING (1 1, 2 2, 3 2)", "POLYGON ((1 1, 1 2, 2 2, 2 1, 1 1))", "POINT (-73.935242 40.73061)"}, + "geo-a": geometryExpected("POINT(30.5 50.25)", "0", "0"), + "geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"), }, got) } @@ -919,16 +936,18 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - var point, line, polygon, wkbPoint string + values := make([]string, 9) require.NoError(t, ro.QueryRow(` - SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb) + SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb), + ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), + ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) FROM events WHERE name = 'dk-geo-bulk-049999' - `).Scan(&point, &line, &polygon, &wkbPoint)) - assert.Equal(t, "POINT (99 49)", point) - assert.Equal(t, "LINESTRING (99 49, 100 50, 101 50)", line) - assert.Equal(t, "POLYGON ((99 49, 99 50, 100 50, 100 49, 99 49))", polygon) - assert.Equal(t, "POINT (99 49)", wkbPoint) + `).Scan(&values[0], &values[1], &values[2], &values[3], &values[4], &values[5], &values[6], &values[7], &values[8])) + for i := range values { + values[i] = compactWKT(values[i]) + } + assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) elapsed := time.Since(start) t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", @@ -975,6 +994,8 @@ func geometryTypesSchema() *arrow.Schema { arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) + lineType := arrow.ListOf(pointType) + polygonType := arrow.ListOf(lineType) return arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, @@ -983,9 +1004,55 @@ func geometryTypesSchema() *arrow.Schema { {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, + {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, + {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, + {Name: "geom_multipoint", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipoint"})}, + {Name: "geom_multiline", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multilinestring"})}, + {Name: "geom_multipolygon", Type: arrow.ListOf(polygonType), Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipolygon"})}, }, nil) } +func geometryTypesColumns() []string { + return []string{ + "name", "value", "is_active", + "geom", "geom_wkt", "geom_geojson", "geom_wkb", + "geom_line", "geom_polygon_native", "geom_multipoint", + "geom_multiline", "geom_multipolygon", + } +} + +func geometryExpected(point, x, y string) []string { + return []string{ + point, + fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + point, + fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), + fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), + fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s)))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + } +} + +func addCoord(v string, delta float64) string { + f, err := strconv.ParseFloat(v, 64) + if err != nil { + panic(err) + } + return coord(f + delta) +} + +func compactWKT(s string) string { + s = strings.ReplaceAll(s, ", ", ",") + s = strings.ReplaceAll(s, " (", "(") + if strings.HasPrefix(s, "MULTIPOINT((") && strings.HasSuffix(s, "))") { + inner := strings.TrimSuffix(strings.TrimPrefix(s, "MULTIPOINT(("), "))") + s = "MULTIPOINT(" + strings.ReplaceAll(inner, "),(", ",") + ")" + } + return s +} + func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() @@ -1013,6 +1080,66 @@ func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) + appendPointList(b.Field(7).(*array.ListBuilder), linePoints(shapeX, shapeY)) + appendPointListList(b.Field(8).(*array.ListBuilder), polygonRings(shapeX, shapeY)) + appendPointList(b.Field(9).(*array.ListBuilder), multiPoints(shapeX, shapeY)) + appendPointListList(b.Field(10).(*array.ListBuilder), multiLines(shapeX, shapeY)) + appendPointListListList(b.Field(11).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) +} + +type xyPoint [2]float64 + +func appendPoint(sb *array.StructBuilder, point xyPoint) { + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) +} + +func appendPointList(lb *array.ListBuilder, points []xyPoint) { + lb.Append(true) + sb := lb.ValueBuilder().(*array.StructBuilder) + for _, point := range points { + appendPoint(sb, point) + } +} + +func appendPointListList(lb *array.ListBuilder, lines [][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, points := range lines { + appendPointList(inner, points) + } +} + +func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, rings := range polygons { + appendPointListList(inner, rings) + } +} + +func linePoints(x, y float64) []xyPoint { + return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y + 1}} +} + +func polygonRings(x, y float64) [][]xyPoint { + return [][]xyPoint{{{x, y}, {x, y + 1}, {x + 1, y + 1}, {x + 1, y}, {x, y}}} +} + +func multiPoints(x, y float64) []xyPoint { + return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y}} +} + +func multiLines(x, y float64) [][]xyPoint { + return [][]xyPoint{ + {{x, y}, {x + 1, y + 1}}, + {{x + 2, y + 2}, {x + 3, y + 3}}, + } +} + +func multiPolygons(x, y float64) [][][]xyPoint { + return [][][]xyPoint{polygonRings(x, y)} } func lineWKT(x, y float64) string { diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index 1656d3cb..71eb7262 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -9,4 +9,9 @@ type events @table(name: "events") { geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) + geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_multipoint: Geometry @geometry_info(srid: 4326, type: MULTIPOINT) + geom_multiline: Geometry @geometry_info(srid: 4326, type: MULTILINESTRING) + geom_multipolygon: Geometry @geometry_info(srid: 4326, type: MULTIPOLYGON) } diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 6ec5c1b8..a59272c6 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "strconv" + "strings" "sync/atomic" "testing" "time" @@ -476,6 +477,8 @@ func geometryTypesSchema() *arrow.Schema { arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) + lineType := arrow.ListOf(pointType) + polygonType := arrow.ListOf(lineType) return arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, @@ -484,9 +487,55 @@ func geometryTypesSchema() *arrow.Schema { {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, + {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, + {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, + {Name: "geom_multipoint", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipoint"})}, + {Name: "geom_multiline", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multilinestring"})}, + {Name: "geom_multipolygon", Type: arrow.ListOf(polygonType), Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipolygon"})}, }, nil) } +func geometryTypesColumns() []string { + return []string{ + "name", "value", "is_active", + "geom", "geom_wkt", "geom_geojson", "geom_wkb", + "geom_line", "geom_polygon_native", "geom_multipoint", + "geom_multiline", "geom_multipolygon", + } +} + +func geometryExpected(point, x, y string) []string { + return []string{ + point, + fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + point, + fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), + fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), + fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s)))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + } +} + +func addCoord(v string, delta float64) string { + f, err := strconv.ParseFloat(v, 64) + if err != nil { + panic(err) + } + return coord(f + delta) +} + +func compactWKT(s string) string { + s = strings.ReplaceAll(s, ", ", ",") + s = strings.ReplaceAll(s, " (", "(") + if strings.HasPrefix(s, "MULTIPOINT((") && strings.HasSuffix(s, "))") { + inner := strings.TrimSuffix(strings.TrimPrefix(s, "MULTIPOINT(("), "))") + s = "MULTIPOINT(" + strings.ReplaceAll(inner, "),(", ",") + ")" + } + return s +} + func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() @@ -514,6 +563,66 @@ func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) + appendPointList(b.Field(7).(*array.ListBuilder), linePoints(shapeX, shapeY)) + appendPointListList(b.Field(8).(*array.ListBuilder), polygonRings(shapeX, shapeY)) + appendPointList(b.Field(9).(*array.ListBuilder), multiPoints(shapeX, shapeY)) + appendPointListList(b.Field(10).(*array.ListBuilder), multiLines(shapeX, shapeY)) + appendPointListListList(b.Field(11).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) +} + +type xyPoint [2]float64 + +func appendPoint(sb *array.StructBuilder, point xyPoint) { + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) +} + +func appendPointList(lb *array.ListBuilder, points []xyPoint) { + lb.Append(true) + sb := lb.ValueBuilder().(*array.StructBuilder) + for _, point := range points { + appendPoint(sb, point) + } +} + +func appendPointListList(lb *array.ListBuilder, lines [][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, points := range lines { + appendPointList(inner, points) + } +} + +func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, rings := range polygons { + appendPointListList(inner, rings) + } +} + +func linePoints(x, y float64) []xyPoint { + return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y + 1}} +} + +func polygonRings(x, y float64) [][]xyPoint { + return [][]xyPoint{{{x, y}, {x, y + 1}, {x + 1, y + 1}, {x + 1, y}, {x, y}}} +} + +func multiPoints(x, y float64) []xyPoint { + return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y}} +} + +func multiLines(x, y float64) [][]xyPoint { + return [][]xyPoint{ + {{x, y}, {x + 1, y + 1}}, + {{x + 2, y + 2}, {x + 3, y + 3}}, + } +} + +func multiPolygons(x, y float64) [][][]xyPoint { + return [][][]xyPoint{polygonRings(x, y)} } func lineWKT(x, y float64) string { @@ -1122,14 +1231,19 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { var out hugrclient.IngestResult require.NoError(t, json.Unmarshal(body, &out)) assert.Equal(t, int64(2), out.Inserted) - assert.ElementsMatch(t, []string{"name", "value", "is_active", "geom", "geom_wkt", "geom_geojson", "geom_wkb"}, out.Columns) + assert.ElementsMatch(t, geometryTypesColumns(), out.Columns) rows, err := env.pgConn.Query(` SELECT name, ST_AsText(geom), ST_SRID(geom), ST_AsText(geom_wkt), ST_SRID(geom_wkt), ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb) + ST_AsText(geom_wkb), ST_SRID(geom_wkb), + ST_AsText(geom_line), ST_SRID(geom_line), + ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), + ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), + ST_AsText(geom_multiline), ST_SRID(geom_multiline), + ST_AsText(geom_multipolygon), ST_SRID(geom_multipolygon) FROM events WHERE name LIKE 'geo-%' ORDER BY name @@ -1140,20 +1254,28 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { got := map[string][]string{} gotSRID := map[string][]int{} for rows.Next() { - var name, point, line, polygon, wkbPoint string - var pointSRID, lineSRID, polygonSRID, wkbPointSRID int - require.NoError(t, rows.Scan(&name, &point, &pointSRID, &line, &lineSRID, &polygon, &polygonSRID, &wkbPoint, &wkbPointSRID)) - got[name] = []string{point, line, polygon, wkbPoint} - gotSRID[name] = []int{pointSRID, lineSRID, polygonSRID, wkbPointSRID} + var name string + values := make([]string, 9) + srids := make([]int, 9) + scanArgs := []any{&name} + for i := range values { + scanArgs = append(scanArgs, &values[i], &srids[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + got[name] = values + gotSRID[name] = srids } require.NoError(t, rows.Err()) assert.Equal(t, map[string][]string{ - "geo-a": []string{"POINT(30.5 50.25)", "LINESTRING(0 0,1 1,2 1)", "POLYGON((0 0,0 1,1 1,1 0,0 0))", "POINT(30.5 50.25)"}, - "geo-b": []string{"POINT(-73.935242 40.73061)", "LINESTRING(1 1,2 2,3 2)", "POLYGON((1 1,1 2,2 2,2 1,1 1))", "POINT(-73.935242 40.73061)"}, + "geo-a": geometryExpected("POINT(30.5 50.25)", "0", "0"), + "geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"), }, got) assert.Equal(t, map[string][]int{ - "geo-a": []int{4326, 4326, 4326, 4326}, - "geo-b": []int{4326, 4326, 4326, 4326}, + "geo-a": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, + "geo-b": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, }, gotSRID) } @@ -1209,21 +1331,30 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - var point, line, polygon, wkbPoint string - var pointSRID, lineSRID, polygonSRID, wkbPointSRID int + values := make([]string, 9) + srids := make([]int, 9) require.NoError(t, env.pgConn.QueryRow(` SELECT ST_AsText(geom), ST_SRID(geom), ST_AsText(geom_wkt), ST_SRID(geom_wkt), ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb) + ST_AsText(geom_wkb), ST_SRID(geom_wkb), + ST_AsText(geom_line), ST_SRID(geom_line), + ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), + ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), + ST_AsText(geom_multiline), ST_SRID(geom_multiline), + ST_AsText(geom_multipolygon), ST_SRID(geom_multipolygon) FROM events WHERE name = 'pg-geo-bulk-049999' - `).Scan(&point, &pointSRID, &line, &lineSRID, &polygon, &polygonSRID, &wkbPoint, &wkbPointSRID)) - assert.Equal(t, "POINT(99 49)", point) - assert.Equal(t, "LINESTRING(99 49,100 50,101 50)", line) - assert.Equal(t, "POLYGON((99 49,99 50,100 50,100 49,99 49))", polygon) - assert.Equal(t, "POINT(99 49)", wkbPoint) - assert.Equal(t, []int{4326, 4326, 4326, 4326}, []int{pointSRID, lineSRID, polygonSRID, wkbPointSRID}) + `).Scan( + &values[0], &srids[0], &values[1], &srids[1], &values[2], &srids[2], + &values[3], &srids[3], &values[4], &srids[4], &values[5], &srids[5], + &values[6], &srids[6], &values[7], &srids[7], &values[8], &srids[8], + )) + for i := range values { + values[i] = compactWKT(values[i]) + } + assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) + assert.Equal(t, []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, srids) elapsed := time.Since(start) t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index c2733297..db56f54b 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -15,5 +15,10 @@ CREATE TABLE events ( geom GEOMETRY(Point, 4326), geom_wkt GEOMETRY(LineString, 4326), geom_geojson GEOMETRY(Polygon, 4326), - geom_wkb GEOMETRY(Point, 4326) + geom_wkb GEOMETRY(Point, 4326), + geom_line GEOMETRY(LineString, 4326), + geom_polygon_native GEOMETRY(Polygon, 4326), + geom_multipoint GEOMETRY(MultiPoint, 4326), + geom_multiline GEOMETRY(MultiLineString, 4326), + geom_multipolygon GEOMETRY(MultiPolygon, 4326) ); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 1656d3cb..71eb7262 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -9,4 +9,9 @@ type events @table(name: "events") { geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) + geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_multipoint: Geometry @geometry_info(srid: 4326, type: MULTIPOINT) + geom_multiline: Geometry @geometry_info(srid: 4326, type: MULTILINESTRING) + geom_multipolygon: Geometry @geometry_info(srid: 4326, type: MULTIPOLYGON) } diff --git a/pkg/engines/airport.go b/pkg/engines/airport.go index bf80f4b3..a8ea22ac 100644 --- a/pkg/engines/airport.go +++ b/pkg/engines/airport.go @@ -26,6 +26,7 @@ func (e *AirportEngine) Type() Type { func (e *AirportEngine) Capabilities() *compiler.EngineCapabilities { cap := e.DuckDB.Capabilities() cap.General.SupportDefaultSequences = false + cap.Insert.Ingest = false return cap } diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 64fed466..0495229c 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -1,11 +1,32 @@ package engines import ( + "fmt" "strings" "github.com/apache/arrow-go/v18/arrow" ) +// ArrowIngestStagingBuilder builds SQL fragments evaluated by DuckDB while an +// Arrow reader is registered as a temporary view. Target engines still decide +// how Arrow columns are shaped, but default/auth expression functions must be +// valid in this DuckDB staging SELECT. +type ArrowIngestStagingBuilder struct { + duckdb DuckDB +} + +func NewArrowIngestStagingBuilder() *ArrowIngestStagingBuilder { + return &ArrowIngestStagingBuilder{} +} + +func (b *ArrowIngestStagingBuilder) SQLValue(v any) (string, error) { + return b.duckdb.SQLValue(v) +} + +func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, named map[string]any) (string, error) { + return b.duckdb.FunctionCall(name, positional, named) +} + func arrowExtensionName(field arrow.Field) string { if extType, ok := field.Type.(arrow.ExtensionType); ok { return strings.ToLower(extType.ExtensionName()) @@ -19,11 +40,63 @@ func arrowExtensionName(field arrow.Field) string { return "" } -func arrowFieldIsExtensionType(field arrow.Field) bool { - _, ok := field.Type.(arrow.ExtensionType) - return ok +func duckDBGeoArrowPointCoords(sql string) string { + return "format('{} {}', struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +} + +func duckDBGeoArrowPointWKT(sql string) string { + return "'POINT (' || " + duckDBGeoArrowPointCoords(sql) + " || ')'" } -func duckDBGeoArrowPoint(sql string) string { - return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +func duckDBGeoArrowPointListCoords(sql string) string { + return "array_to_string(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointCoords("_p") + "), ', ')" +} + +func duckDBGeoArrowLineStringWKT(sql string) string { + return "'LINESTRING (' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" +} + +func duckDBGeoArrowRingWKT(sql string) string { + return "'(' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" +} + +func duckDBGeoArrowPolygonWKT(sql string) string { + return "'POLYGON (' || array_to_string(list_transform(" + sql + ", lambda _r: " + + duckDBGeoArrowRingWKT("_r") + "), ', ') || ')'" +} + +func duckDBGeoArrowMultiPointWKT(sql string) string { + return "'MULTIPOINT (' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" +} + +func duckDBGeoArrowMultiLineStringWKT(sql string) string { + return "'MULTILINESTRING (' || array_to_string(list_transform(" + sql + ", lambda _ls: " + + duckDBGeoArrowRingWKT("_ls") + "), ', ') || ')'" +} + +func duckDBGeoArrowMultiPolygonWKT(sql string) string { + return "'MULTIPOLYGON (' || array_to_string(list_transform(" + sql + ", lambda _poly: '(' || " + + "array_to_string(list_transform(_poly, lambda _r: " + duckDBGeoArrowRingWKT("_r") + + "), ', ') || ')'), ', ') || ')'" +} + +func duckDBGeoArrowNativeWKT(ext, sql string) (string, error) { + switch ext { + case "geoarrow.point": + return duckDBGeoArrowPointWKT(sql), nil + case "geoarrow.linestring": + return duckDBGeoArrowLineStringWKT(sql), nil + case "geoarrow.polygon": + return duckDBGeoArrowPolygonWKT(sql), nil + case "geoarrow.multipoint": + return duckDBGeoArrowMultiPointWKT(sql), nil + case "geoarrow.multilinestring": + return duckDBGeoArrowMultiLineStringWKT(sql), nil + case "geoarrow.multipolygon": + return duckDBGeoArrowMultiPolygonWKT(sql), nil + case "geoarrow.geometry", "geoarrow.geometrycollection": + return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) + } } diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go new file mode 100644 index 00000000..d62ed654 --- /dev/null +++ b/pkg/engines/arrow_ingest_test.go @@ -0,0 +1,158 @@ +package engines + +import ( + "strings" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + "github.com/paulmach/orb" + "github.com/vektah/gqlparser/v2/ast" +) + +func TestDuckDBArrowIngestCastsNativeGeoArrowExplicitly(t *testing.T) { + field := geometryTestField("") + + tests := []struct { + ext string + want string + }{ + {"geoarrow.point", "POINT"}, + {"geoarrow.linestring", "LINESTRING"}, + {"geoarrow.polygon", "POLYGON"}, + {"geoarrow.multipoint", "MULTIPOINT"}, + {"geoarrow.multilinestring", "MULTILINESTRING"}, + {"geoarrow.multipolygon", "MULTIPOLYGON"}, + } + + for _, tt := range tests { + t.Run(tt.ext, func(t *testing.T) { + got, err := CastArrowIngestValueToDuckDB(field, arrow.Field{ + Name: "geom", + Type: geoArrowTestType(tt.ext), + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), + }, "geom") + if err != nil { + t.Fatal(err) + } + if got == "geom" { + t.Fatalf("expected explicit conversion, got raw column") + } + if !strings.Contains(got, "ST_GeomFromText(") || !strings.Contains(got, tt.want) { + t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) + } + }) + } +} + +func TestPostgresArrowIngestCastsNativeGeoArrowAsEWKT(t *testing.T) { + field := geometryTestField("4326") + + tests := []struct { + ext string + want string + }{ + {"geoarrow.point", "POINT"}, + {"geoarrow.linestring", "LINESTRING"}, + {"geoarrow.polygon", "POLYGON"}, + {"geoarrow.multipoint", "MULTIPOINT"}, + {"geoarrow.multilinestring", "MULTILINESTRING"}, + {"geoarrow.multipolygon", "MULTIPOLYGON"}, + } + + for _, tt := range tests { + t.Run(tt.ext, func(t *testing.T) { + got, err := CastArrowIngestValueToPostgres(field, arrow.Field{ + Name: "geom", + Type: geoArrowTestType(tt.ext), + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), + }, "geom") + if err != nil { + t.Fatal(err) + } + if got == "geom" { + t.Fatalf("expected explicit conversion, got raw column") + } + if !strings.Contains(got, "'SRID=4326;' || ") || !strings.Contains(got, tt.want) { + t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) + } + }) + } +} + +func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { + field := geometryTestField("") + for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { + t.Run(ext, func(t *testing.T) { + _, err := CastArrowIngestValueToDuckDB(field, arrow.Field{ + Name: "geom", + Type: arrow.StructOf(), + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": ext}), + }, "geom") + if err == nil { + t.Fatalf("expected %s to be rejected", ext) + } + }) + } +} + +func TestPostgresArrowIngestSQLValueUsesDuckDBStagingLiterals(t *testing.T) { + engine := &Postgres{} + + jsonSQL, err := engine.ArrowIngestSQLValue(nil, map[string]any{"status": "ok"}) + if err != nil { + t.Fatal(err) + } + if strings.Contains(jsonSQL, "JSONB") || !strings.Contains(jsonSQL, "::JSON") { + t.Fatalf("expected DuckDB JSON literal, got %s", jsonSQL) + } + + geomSQL, err := engine.ArrowIngestSQLValue(geometryTestField("4326"), orb.Point{1, 2}) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(geomSQL, "'SRID=4326;' || ") || !strings.Contains(geomSQL, "POINT") { + t.Fatalf("expected Postgres EWKT literal, got %s", geomSQL) + } +} + +func geometryTestField(srid string) *ast.Field { + def := &ast.FieldDefinition{ + Name: "geom", + Type: ast.NamedType(base.GeometryTypeName, nil), + } + if srid != "" { + def.Directives = ast.DirectiveList{ + &ast.Directive{ + Name: base.FieldGeometryInfoDirectiveName, + Arguments: ast.ArgumentList{ + &ast.Argument{Name: base.ArgSRID, Value: &ast.Value{Raw: srid}}, + }, + }, + } + } + return &ast.Field{ + Name: "geom", + Alias: "geom", + Definition: def, + } +} + +func geoArrowTestType(ext string) arrow.DataType { + point := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64}, + ) + switch ext { + case "geoarrow.point": + return point + case "geoarrow.linestring", "geoarrow.multipoint": + return arrow.ListOf(point) + case "geoarrow.polygon", "geoarrow.multilinestring": + return arrow.ListOf(arrow.ListOf(point)) + case "geoarrow.multipolygon": + return arrow.ListOf(arrow.ListOf(arrow.ListOf(point))) + default: + return point + } +} diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index cf5fca1e..99bd9c70 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -90,6 +90,10 @@ func (e *DuckDB) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, return CastArrowIngestValueToDuckDB(field, arrowField, sql) } +func (e *DuckDB) ArrowIngestSQLValue(_ *ast.Field, value any) (string, error) { + return e.SQLValue(value) +} + func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { if field == nil || field.Definition == nil { return sql, nil @@ -116,20 +120,19 @@ func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { switch arrowExtensionName(field) { case "geoarrow.wkb": - if arrowFieldIsExtensionType(field) { - return sql, nil - } - return "ST_GeomFromWKB(" + sql + ")", nil + return "ST_GeomFromText(ST_AsText(" + sql + "), true)", nil case "geoarrow.wkt": return "ST_GeomFromText(" + sql + ", true)", nil case "hugr.geojson", "geoarrow.geojson", "geojson": return "ST_GeomFromGeoJSON(" + sql + ")", nil - case "geoarrow.point": - return duckDBGeoArrowPoint(sql), nil case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.geometry", "geoarrow.geometrycollection": - return sql, nil + "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": + wkt, err := duckDBGeoArrowNativeWKT(arrowExtensionName(field), sql) + if err != nil { + return "", err + } + return "ST_GeomFromText(" + wkt + ", true)", nil } switch field.Type.ID() { diff --git a/pkg/engines/ducklake.go b/pkg/engines/ducklake.go index fed1902e..f8ca61e5 100644 --- a/pkg/engines/ducklake.go +++ b/pkg/engines/ducklake.go @@ -31,6 +31,7 @@ func (e *DuckLake) Capabilities() *compiler.EngineCapabilities { dbCaps := e.duckdb.Capabilities() caps := *dbCaps // defensive copy caps.General.SupportTimeTravel = true + caps.Insert.Ingest = false return &caps } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 60a56ca8..662edd75 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -75,7 +75,13 @@ type EngineTypeCaster interface { type EngineArrowIngestCaster interface { Engine + // CastArrowIngestValue returns a SQL expression evaluated by the DuckDB + // Arrow staging query and shaped for insertion into this target engine. CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) + // ArrowIngestSQLValue returns a DuckDB-staging SQL literal shaped for this + // target engine. It is used for permission/default values mixed into the + // INSERT ... SELECT built from an Arrow view. + ArrowIngestSQLValue(field *ast.Field, value any) (string, error) } type EngineVectorDistanceCalculator interface { diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 00dc3edc..26e8a6a2 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -599,6 +599,26 @@ func (e *Postgres) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field return CastArrowIngestValueToPostgres(field, arrowField, sql) } +func (e *Postgres) ArrowIngestSQLValue(field *ast.Field, value any) (string, error) { + if value == nil { + return "NULL", nil + } + if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { + geom, err := ctypes.ParseGeometryValue(value) + if err != nil { + return "", err + } + if geom == nil { + return "NULL", nil + } + srid := base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) + wktValue := strings.ReplaceAll(string(wkt.Marshal(geom)), "'", "''") + return postgresWKTText("'"+wktValue+"'", srid), nil + } + var duckdb DuckDB + return duckdb.SQLValue(value) +} + func CastArrowIngestValueToPostgres(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { if field == nil || field.Definition == nil { return sql, nil @@ -620,20 +640,19 @@ func castArrowGeometryToPostgres(field *ast.Field, arrowField arrow.Field, sql s } switch arrowExtensionName(arrowField) { case "geoarrow.wkb": - if arrowFieldIsExtensionType(arrowField) { - return postgresGeometryText(sql, srid), nil - } - return postgresGeometryText("ST_GeomFromWKB("+sql+")", srid), nil + return postgresGeometryText(sql, srid), nil case "geoarrow.wkt": return postgresWKTText(sql, srid), nil case "hugr.geojson", "geoarrow.geojson", "geojson": return postgresGeometryText("ST_GeomFromGeoJSON("+sql+")", srid), nil - case "geoarrow.point": - return postgresGeometryText(duckDBGeoArrowPoint(sql), srid), nil case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.geometry", "geoarrow.geometrycollection": - return postgresGeometryText(sql, srid), nil + "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": + wkt, err := duckDBGeoArrowNativeWKT(arrowExtensionName(arrowField), sql) + if err != nil { + return "", err + } + return postgresWKTText(wkt, srid), nil } switch arrowField.Type.ID() { diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index de27feb1..42b4a703 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -43,6 +43,10 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } + ingestEngine, ok := engine.(engines.EngineArrowIngestCaster) + if !ok { + return nil, fmt.Errorf("engine %q declares IPC ingest support but does not implement Arrow ingest casting", engine.Type()) + } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) @@ -59,7 +63,7 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err != nil { return nil, err } - return ingestNode(ctx, info, mutation, engine, columns, permissionData), nil + return ingestNode(ctx, info, mutation, ingestEngine, columns, permissionData), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -224,25 +228,15 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info return permissionData, nil } -func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any) *QueryPlanNode { +func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { fieldValues := make(map[string]string, len(columns)) for _, c := range columns { value := engines.Ident(c.ArrowField.Name) - field := &ast.Field{ - Name: c.Field.Name, - Alias: c.Field.Name, - Definition: c.FieldDef, - ObjectDefinition: info.Definition(), - } - var err error - if caster, ok := engine.(engines.EngineArrowIngestCaster); ok { - value, err = caster.CastArrowIngestValue(field, c.ArrowField, value) - } else { - value, err = engines.CastArrowIngestValueToDuckDB(field, c.ArrowField, value) - } + field := ingestASTField(info, c.Field, c.FieldDef) + value, err := engine.CastArrowIngestValue(field, c.ArrowField, value) if err != nil { return "", nil, err } @@ -256,13 +250,21 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e if fieldInfo.IsReferencesSubquery() || fieldInfo.IsNotDBField() { return "", nil, fmt.Errorf("permission data field %q cannot be ingested directly", name) } - sqlValue, err := engine.SQLValue(value) + fieldDef := info.Definition().Fields.ForName(name) + if fieldDef == nil { + return "", nil, fmt.Errorf("permission data field %q definition not found in data object %q", name, info.Name) + } + sqlValue, err := engine.ArrowIngestSQLValue(ingestASTField(info, fieldInfo, fieldDef), value) if err != nil { return "", nil, err } fieldValues[name] = sqlValue } - if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engine); err != nil { + // Arrow ingest SELECT expressions are evaluated by DuckDB because + // the temporary Arrow view is registered on a DuckDB connection. + // Target engines shape column values above; default/auth helper + // expressions must still be valid in the DuckDB staging SELECT. + if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engines.NewArrowIngestStagingBuilder()); err != nil { return "", nil, err } @@ -298,3 +300,12 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e }, } } + +func ingestASTField(info *sdl.Object, fieldInfo *sdl.Field, fieldDef *ast.FieldDefinition) *ast.Field { + return &ast.Field{ + Name: fieldInfo.Name, + Alias: fieldInfo.Name, + Definition: fieldDef, + ObjectDefinition: info.Definition(), + } +} From 01be265f41b79ff9fdb1becad15874a1ba58e5db Mon Sep 17 00:00:00 2001 From: vadim Date: Tue, 9 Jun 2026 23:22:29 +0400 Subject: [PATCH 20/36] ipc ingest --- pkg/db/pool.go | 1 + pkg/engines/arrow_ingest.go | 47 +++++++++++++++++++++ pkg/engines/arrow_ingest_test.go | 16 +++---- pkg/engines/duckdb.go | 55 ++++-------------------- pkg/engines/engines.go | 13 +++--- pkg/engines/postgres.go | 52 ++++++----------------- pkg/planner/node_arrow_ingest.go | 72 +++++++++++++++++++++++++++++++- 7 files changed, 153 insertions(+), 103 deletions(-) diff --git a/pkg/db/pool.go b/pkg/db/pool.go index cfdd2830..8d2b59cf 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -230,6 +230,7 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { // ExecWithArrowView registers reader as TempArrowViewName and executes query on // the same DuckDB driver connection, where the temporary Arrow view is visible. +// todo rename => ExecWithArrow / ExecArrow func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string) (sql.Result, error) { if reader == nil { return nil, fmt.Errorf("missing arrow reader") diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 0495229c..6a086b9d 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -27,6 +27,53 @@ func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, return b.duckdb.FunctionCall(name, positional, named) } +func duckDBArrowJSONExpr(arrowField arrow.Field, sourceExpr string) string { + switch arrowField.Type.ID() { + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW, + arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + return "try_cast(" + sourceExpr + " AS JSON)" + case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, + arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: + return "to_json(" + sourceExpr + ")" + default: + return sourceExpr + } +} + +func duckDBArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + wktExpr, err := duckDBArrowGeometryWKTExpr(arrowField, sourceExpr) + if err != nil { + return "", err + } + return "ST_GeomFromText(" + wktExpr + ", true)", nil +} + +func duckDBArrowGeometryWKTExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + switch arrowExtensionName(arrowField) { + case "geoarrow.wkb": + return "ST_AsText(" + sourceExpr + ")", nil + case "geoarrow.wkt": + return sourceExpr, nil + case "hugr.geojson", "geoarrow.geojson", "geojson": + return "ST_AsText(ST_GeomFromGeoJSON(" + sourceExpr + "))", nil + case "geoarrow.linestring", "geoarrow.polygon", + "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", + "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": + return duckDBGeoArrowNativeWKT(arrowExtensionName(arrowField), sourceExpr) + } + + switch arrowField.Type.ID() { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: + return "ST_AsText(ST_GeomFromWKB(" + sourceExpr + "))", nil + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return "CASE WHEN starts_with(trim(" + sourceExpr + "), '{') THEN ST_AsText(ST_GeomFromGeoJSON(" + sourceExpr + ")) ELSE " + sourceExpr + " END", nil + case arrow.STRUCT, arrow.MAP: + return "ST_AsText(ST_GeomFromGeoJSON(to_json(" + sourceExpr + ")::VARCHAR))", nil + default: + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) + } +} + func arrowExtensionName(field arrow.Field) string { if extType, ok := field.Type.(arrow.ExtensionType); ok { return strings.ToLower(extType.ExtensionName()) diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index d62ed654..4398d0cd 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -10,7 +10,7 @@ import ( "github.com/vektah/gqlparser/v2/ast" ) -func TestDuckDBArrowIngestCastsNativeGeoArrowExplicitly(t *testing.T) { +func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { field := geometryTestField("") tests := []struct { @@ -27,7 +27,7 @@ func TestDuckDBArrowIngestCastsNativeGeoArrowExplicitly(t *testing.T) { for _, tt := range tests { t.Run(tt.ext, func(t *testing.T) { - got, err := CastArrowIngestValueToDuckDB(field, arrow.Field{ + got, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: geoArrowTestType(tt.ext), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -45,7 +45,7 @@ func TestDuckDBArrowIngestCastsNativeGeoArrowExplicitly(t *testing.T) { } } -func TestPostgresArrowIngestCastsNativeGeoArrowAsEWKT(t *testing.T) { +func TestPostgresArrowIngestBuildsNativeGeoArrowEWKTSelectExpr(t *testing.T) { field := geometryTestField("4326") tests := []struct { @@ -62,7 +62,7 @@ func TestPostgresArrowIngestCastsNativeGeoArrowAsEWKT(t *testing.T) { for _, tt := range tests { t.Run(tt.ext, func(t *testing.T) { - got, err := CastArrowIngestValueToPostgres(field, arrow.Field{ + got, err := postgresArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: geoArrowTestType(tt.ext), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -84,7 +84,7 @@ func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { field := geometryTestField("") for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { t.Run(ext, func(t *testing.T) { - _, err := CastArrowIngestValueToDuckDB(field, arrow.Field{ + _, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: arrow.StructOf(), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": ext}), @@ -96,10 +96,10 @@ func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { } } -func TestPostgresArrowIngestSQLValueUsesDuckDBStagingLiterals(t *testing.T) { +func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { engine := &Postgres{} - jsonSQL, err := engine.ArrowIngestSQLValue(nil, map[string]any{"status": "ok"}) + jsonSQL, err := engine.ArrowIngestLiteralExpr(nil, map[string]any{"status": "ok"}) if err != nil { t.Fatal(err) } @@ -107,7 +107,7 @@ func TestPostgresArrowIngestSQLValueUsesDuckDBStagingLiterals(t *testing.T) { t.Fatalf("expected DuckDB JSON literal, got %s", jsonSQL) } - geomSQL, err := engine.ArrowIngestSQLValue(geometryTestField("4326"), orb.Point{1, 2}) + geomSQL, err := engine.ArrowIngestLiteralExpr(geometryTestField("4326"), orb.Point{1, 2}) if err != nil { t.Fatal(err) } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 99bd9c70..adfa51c4 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -86,64 +86,25 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { } } -func (e *DuckDB) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { - return CastArrowIngestValueToDuckDB(field, arrowField, sql) +func (e *DuckDB) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { + return duckDBArrowIngestSelectExpr(field, arrowField, sourceExpr) } -func (e *DuckDB) ArrowIngestSQLValue(_ *ast.Field, value any) (string, error) { +func (e *DuckDB) ArrowIngestLiteralExpr(_ *ast.Field, value any) (string, error) { return e.SQLValue(value) } -func CastArrowIngestValueToDuckDB(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { +func duckDBArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { if field == nil || field.Definition == nil { - return sql, nil + return sourceExpr, nil } switch field.Definition.Type.Name() { case base.JSONTypeName: - switch arrowField.Type.ID() { - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW, - arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: - return "try_cast(" + sql + " AS JSON)", nil - case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, - arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: - return "to_json(" + sql + ")", nil - default: - return sql, nil - } + return duckDBArrowJSONExpr(arrowField, sourceExpr), nil case base.GeometryTypeName: - return castArrowGeometryToDuckDB(arrowField, sql) - default: - return sql, nil - } -} - -func castArrowGeometryToDuckDB(field arrow.Field, sql string) (string, error) { - switch arrowExtensionName(field) { - case "geoarrow.wkb": - return "ST_GeomFromText(ST_AsText(" + sql + "), true)", nil - case "geoarrow.wkt": - return "ST_GeomFromText(" + sql + ", true)", nil - case "hugr.geojson", "geoarrow.geojson", "geojson": - return "ST_GeomFromGeoJSON(" + sql + ")", nil - case "geoarrow.linestring", "geoarrow.polygon", - "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - wkt, err := duckDBGeoArrowNativeWKT(arrowExtensionName(field), sql) - if err != nil { - return "", err - } - return "ST_GeomFromText(" + wkt + ", true)", nil - } - - switch field.Type.ID() { - case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: - return "ST_GeomFromWKB(" + sql + ")", nil - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN ST_GeomFromGeoJSON(" + sql + ") ELSE ST_GeomFromText(" + sql + ", true) END", nil - case arrow.STRUCT, arrow.MAP: - return "ST_GeomFromGeoJSON(to_json(" + sql + ")::VARCHAR)", nil + return duckDBArrowGeometryExpr(arrowField, sourceExpr) default: - return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", field.Name, field.Type) + return sourceExpr, nil } } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 662edd75..37820786 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -75,13 +75,12 @@ type EngineTypeCaster interface { type EngineArrowIngestCaster interface { Engine - // CastArrowIngestValue returns a SQL expression evaluated by the DuckDB - // Arrow staging query and shaped for insertion into this target engine. - CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) - // ArrowIngestSQLValue returns a DuckDB-staging SQL literal shaped for this - // target engine. It is used for permission/default values mixed into the - // INSERT ... SELECT built from an Arrow view. - ArrowIngestSQLValue(field *ast.Field, value any) (string, error) + // ArrowIngestSelectExpr returns a DuckDB-compatible SELECT expression for + // one Arrow-view column, shaped for insertion into this target engine. + ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) + // ArrowIngestLiteralExpr returns a DuckDB-compatible literal/expression for + // non-Arrow values mixed into the ingest SELECT, shaped for this target. + ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) } type EngineVectorDistanceCalculator interface { diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 26e8a6a2..f5cfede7 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -595,11 +595,11 @@ func (e *Postgres) CastFromIntermediateType(f *ast.Field, toJSON bool) (string, return Ident(f.Alias), nil } -func (e *Postgres) CastArrowIngestValue(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { - return CastArrowIngestValueToPostgres(field, arrowField, sql) +func (e *Postgres) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { + return postgresArrowIngestSelectExpr(field, arrowField, sourceExpr) } -func (e *Postgres) ArrowIngestSQLValue(field *ast.Field, value any) (string, error) { +func (e *Postgres) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { if value == nil { return "NULL", nil } @@ -619,56 +619,30 @@ func (e *Postgres) ArrowIngestSQLValue(field *ast.Field, value any) (string, err return duckdb.SQLValue(value) } -func CastArrowIngestValueToPostgres(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { +func postgresArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { if field == nil || field.Definition == nil { - return sql, nil + return sourceExpr, nil } switch field.Definition.Type.Name() { case base.JSONTypeName: - return CastArrowIngestValueToDuckDB(field, arrowField, sql) + return duckDBArrowJSONExpr(arrowField, sourceExpr), nil case base.GeometryTypeName: - return castArrowGeometryToPostgres(field, arrowField, sql) + return postgresArrowGeometryWKTExpr(field, arrowField, sourceExpr) default: - return sql, nil + return sourceExpr, nil } } -func castArrowGeometryToPostgres(field *ast.Field, arrowField arrow.Field, sql string) (string, error) { +func postgresArrowGeometryWKTExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { srid := "" if field != nil && field.Definition != nil { srid = base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) } - switch arrowExtensionName(arrowField) { - case "geoarrow.wkb": - return postgresGeometryText(sql, srid), nil - case "geoarrow.wkt": - return postgresWKTText(sql, srid), nil - case "hugr.geojson", "geoarrow.geojson", "geojson": - return postgresGeometryText("ST_GeomFromGeoJSON("+sql+")", srid), nil - case "geoarrow.linestring", "geoarrow.polygon", - "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - wkt, err := duckDBGeoArrowNativeWKT(arrowExtensionName(arrowField), sql) - if err != nil { - return "", err - } - return postgresWKTText(wkt, srid), nil - } - - switch arrowField.Type.ID() { - case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: - return postgresGeometryText("ST_GeomFromWKB("+sql+")", srid), nil - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CASE WHEN starts_with(trim(" + sql + "), '{') THEN " + postgresGeometryText("ST_GeomFromGeoJSON("+sql+")", srid) + " ELSE " + postgresWKTText(sql, srid) + " END", nil - case arrow.STRUCT, arrow.MAP: - return postgresGeometryText("ST_GeomFromGeoJSON(to_json("+sql+")::VARCHAR)", srid), nil - default: - return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Postgres Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) + wktExpr, err := duckDBArrowGeometryWKTExpr(arrowField, sourceExpr) + if err != nil { + return "", err } -} - -func postgresGeometryText(sql, srid string) string { - return postgresWKTText("ST_AsText("+sql+")", srid) + return postgresWKTText(wktExpr, srid), nil } func postgresWKTText(sql, srid string) string { diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 42b4a703..2051085d 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -111,35 +111,62 @@ func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObj return info, mutationField, nil } +// resolveIngestColumns matches Arrow IPC stream columns to the GraphQL insert +// contract for the target table data object. +// +// - provider resolves GraphQL definitions generated by the catalog compiler. +// - info describes the GraphQL data object and its DB table/field mapping. +// - mutation describes the GraphQL insert mutation and insertable fields. +// - schema is the Arrow IPC schema supplied by the request body. +// +// The returned ingestColumn values keep all three names/spaces together: +// Arrow field, GraphQL object/input field, and DB table column mapping. func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutation *sdl.Mutation, schema *arrow.Schema) ([]ingestColumn, error) { if schema == nil { return nil, fmt.Errorf("arrow stream has no schema") } + // GraphQL input type accepted by the insert mutation, for example + // events_mut_input_data. Arrow columns must be valid fields of this input. inputName := info.InputInsertDataName() if inputName == "" { return nil, fmt.Errorf("data object %q has no insert input type", info.Name) } + // GraphQL definition of the insert input object. This is not the DB table; + // it is the public mutation contract used for permission/input validation. input := provider.ForName(ctx, inputName) if input == nil { return nil, fmt.Errorf("insert input type %q not found", inputName) } + // seen detects duplicate Arrow column names before they collapse into the + // byName lookup map below. seen := map[string]struct{}{} + // byName stores the resolved ingest metadata keyed by GraphQL/Arrow field + // name. Field source directives may later map this name to another DB column. byName := make(map[string]ingestColumn, schema.NumFields()) for _, af := range schema.Fields() { + // af is the physical Arrow IPC column. Its name is matched against the + // GraphQL insert input and data object field names. if _, dup := seen[af.Name]; dup { return nil, fmt.Errorf("duplicate arrow column %q", af.Name) } seen[af.Name] = struct{}{} + // inputField is the GraphQL mutation input field. If it is absent, the + // client is trying to ingest a column that the insert API does not expose. inputField := input.Fields.ForName(af.Name) if inputField == nil { return nil, fmt.Errorf("column %q is not defined in insert input %q", af.Name, inputName) } + // objectField is the GraphQL field on the table data object. It carries + // type information and directives such as @geometry_info / @field_source. objectField := info.Definition().Fields.ForName(af.Name) if objectField == nil { return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) } + // fieldInfo is the compiled catalog view of objectField. It knows whether + // the GraphQL field is a real DB field, a reference, or a computed field, + // and how it maps to the table column. fieldInfo := info.FieldForName(af.Name) if fieldInfo == nil { return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) @@ -161,6 +188,9 @@ func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info * } } + // Check required GraphQL insert fields that were not supplied by Arrow. + // Required fields are allowed to be omitted only when the DB/catalog can + // provide them through a sequence, default insert expression, or @default. for _, fieldInfo := range mutation.Fields() { if _, ok := byName[fieldInfo.Name]; ok { continue @@ -178,6 +208,8 @@ func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info * return nil, fmt.Errorf("field %q is required for ingest into %q", fieldInfo.Name, info.Name) } + // Preserve Arrow stream column order for the SELECT list. The map above is + // only for validation and required-field checks. columns := make([]ingestColumn, 0, len(byName)) for _, af := range schema.Fields() { columns = append(columns, byName[af.Name]) @@ -228,21 +260,47 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info return permissionData, nil } +// ingestNode builds the INSERT ... SELECT statement that copies rows from the +// temporary DuckDB Arrow view into the target DB table. +// +// - info is the GraphQL data object plus its DB table/column mapping. +// - mutation is the GraphQL insert mutation used for insert defaults. +// - engine converts Arrow-view expressions into values accepted by the +// target engine/table. +// - columns are Arrow columns already resolved to GraphQL fields and DB +// columns by resolveIngestColumns. +// - permissionData contains extra GraphQL input values injected by the +// permission layer; they do not come from the Arrow stream. func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { + // fieldValues is keyed by GraphQL field name. Each value is a SQL + // expression evaluated in the SELECT part of INSERT ... SELECT. + // The expression may reference an Arrow column from the temporary + // DuckDB view, or it may be a constant/default/permission value. fieldValues := make(map[string]string, len(columns)) for _, c := range columns { + // c.ArrowField.Name is the physical Arrow view column name. + // It is not necessarily the final DB column name; @field_source + // is applied later when targetFields is built. value := engines.Ident(c.ArrowField.Name) + // Synthetic GraphQL field used only to pass type/directive + // metadata to the engine-specific Arrow ingest caster. field := ingestASTField(info, c.Field, c.FieldDef) - value, err := engine.CastArrowIngestValue(field, c.ArrowField, value) + // Build the Arrow ingest SELECT expression for the target + // GraphQL/DB field. + // Examples: JSON to_json(...), Geometry ST_GeomFromText(...), + // or Postgres EWKT text for PostGIS columns. + value, err := engine.ArrowIngestSelectExpr(field, c.ArrowField, value) if err != nil { return "", nil, err } fieldValues[c.Field.Name] = value } for name, value := range permissionData { + // Permission data is addressed by GraphQL input/object field + // name, then converted to a staging SQL literal/expression. fieldInfo := info.FieldForName(name) if fieldInfo == nil { return "", nil, fmt.Errorf("permission data field %q is not defined in data object %q", name, info.Name) @@ -254,7 +312,9 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e if fieldDef == nil { return "", nil, fmt.Errorf("permission data field %q definition not found in data object %q", name, info.Name) } - sqlValue, err := engine.ArrowIngestSQLValue(ingestASTField(info, fieldInfo, fieldDef), value) + // Unlike Arrow columns, this value has no Arrow type. The target + // engine still decides how to shape the literal for ingest. + sqlValue, err := engine.ArrowIngestLiteralExpr(ingestASTField(info, fieldInfo, fieldDef), value) if err != nil { return "", nil, err } @@ -270,11 +330,17 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e var targetFields, selectExprs []string for _, c := range columns { + // targetFields are DB table columns. FieldSourceName applies the + // catalog mapping from GraphQL field name to physical DB column. targetFields = append(targetFields, c.Field.FieldSourceName("", true)) + // selectExprs are evaluated from the DuckDB Arrow view and must + // stay in the same order as targetFields. selectExprs = append(selectExprs, fieldValues[c.Field.Name]) delete(fieldValues, c.Field.Name) } for _, fieldInfo := range mutation.Fields() { + // Remaining fieldValues are values not backed by Arrow columns: + // permission data and default insert expressions. expr, ok := fieldValues[fieldInfo.Name] if !ok { continue @@ -291,6 +357,8 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } target := info.SQL(ctx, engines.Ident(info.Catalog)) + // The FROM relation is the fixed temporary Arrow view registered on + // the same DuckDB connection that executes this statement. return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s", target, strings.Join(targetFields, ", "), From a3e4f11cdb5bf87b900a27ecaa67e3e0515e99c1 Mon Sep 17 00:00:00 2001 From: vadim Date: Wed, 10 Jun 2026 11:55:34 +0400 Subject: [PATCH 21/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 123 ++++++++++++++++++ .../ingest-postgres/ingest_postgres_test.go | 123 ++++++++++++++++++ 2 files changed, 246 insertions(+) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index a78a6045..d4ca00ba 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -879,6 +879,30 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { }, got) } +func TestIngest_HTTP_GeometryTypes_ReadThroughHugr_DuckDB(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeometryTypesRecord(t, []string{"geo-read-a", "geo-read-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{ + geometryReadExpected("geo-read-a", [2]float64{30.5, 50.25}, 0, 0), + geometryReadExpected("geo-read-b", [2]float64{-73.935242, 40.730610}, 1, 1), + }) +} + func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { env := setupEnv(t) @@ -948,6 +972,9 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { values[i] = compactWKT(values[i]) } assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "dk-geo-bulk-049999" } }`, []map[string]any{ + geometryReadExpected("dk-geo-bulk-049999", [2]float64{99, 49}, 99, 49), + }) elapsed := time.Since(start) t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", @@ -1035,6 +1062,102 @@ func geometryExpected(point, x, y string) []string { } } +func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { + t.Helper() + + query := fmt.Sprintf(`{ + %s { + events(%s, order_by: [{field: "name", direction: ASC}]) { + name + geom + geom_wkt + geom_geojson + geom_wkb + geom_line + geom_polygon_native + geom_multipoint + geom_multiline + geom_multipolygon + } + } + }`, dsName, filter) + + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data, ok := payload["data"].(map[string]any) + require.True(t, ok, "response data must be an object: %s", string(body)) + root, ok := data[dsName].(map[string]any) + require.True(t, ok, "response data.%s must be an object: %s", dsName, string(body)) + rawRows, ok := root["events"].([]any) + require.True(t, ok, "response data.%s.events must be an array: %s", dsName, string(body)) + + got := make([]map[string]any, 0, len(rawRows)) + for _, raw := range rawRows { + row, ok := raw.(map[string]any) + require.True(t, ok, "event row must be an object: %#v", raw) + got = append(got, row) + } + assert.Equal(t, expected, got) +} + +func geometryReadExpected(name string, point [2]float64, x, y float64) map[string]any { + return map[string]any{ + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + } +} + +func geoJSONGeometry(typ string, coordinates any) map[string]any { + return map[string]any{ + "type": typ, + "coordinates": coordinates, + } +} + +func pointCoordinate(point xyPoint) []any { + return []any{point[0], point[1]} +} + +func pointCoordinates(points []xyPoint) []any { + coords := make([]any, 0, len(points)) + for _, point := range points { + coords = append(coords, pointCoordinate(point)) + } + return coords +} + +func nestedPointCoordinates(lines [][]xyPoint) []any { + coords := make([]any, 0, len(lines)) + for _, line := range lines { + coords = append(coords, pointCoordinates(line)) + } + return coords +} + +func deepPointCoordinates(polygons [][][]xyPoint) []any { + coords := make([]any, 0, len(polygons)) + for _, polygon := range polygons { + coords = append(coords, nestedPointCoordinates(polygon)) + } + return coords +} + func addCoord(v string, delta float64) string { f, err := strconv.ParseFloat(v, 64) if err != nil { diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index a59272c6..6a50ae55 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -518,6 +518,102 @@ func geometryExpected(point, x, y string) []string { } } +func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { + t.Helper() + + query := fmt.Sprintf(`{ + %s { + events(%s, order_by: [{field: "name", direction: ASC}]) { + name + geom + geom_wkt + geom_geojson + geom_wkb + geom_line + geom_polygon_native + geom_multipoint + geom_multiline + geom_multipolygon + } + } + }`, dsName, filter) + + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data, ok := payload["data"].(map[string]any) + require.True(t, ok, "response data must be an object: %s", string(body)) + root, ok := data[dsName].(map[string]any) + require.True(t, ok, "response data.%s must be an object: %s", dsName, string(body)) + rawRows, ok := root["events"].([]any) + require.True(t, ok, "response data.%s.events must be an array: %s", dsName, string(body)) + + got := make([]map[string]any, 0, len(rawRows)) + for _, raw := range rawRows { + row, ok := raw.(map[string]any) + require.True(t, ok, "event row must be an object: %#v", raw) + got = append(got, row) + } + assert.Equal(t, expected, got) +} + +func geometryReadExpected(name string, point [2]float64, x, y float64) map[string]any { + return map[string]any{ + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + } +} + +func geoJSONGeometry(typ string, coordinates any) map[string]any { + return map[string]any{ + "type": typ, + "coordinates": coordinates, + } +} + +func pointCoordinate(point xyPoint) []any { + return []any{point[0], point[1]} +} + +func pointCoordinates(points []xyPoint) []any { + coords := make([]any, 0, len(points)) + for _, point := range points { + coords = append(coords, pointCoordinate(point)) + } + return coords +} + +func nestedPointCoordinates(lines [][]xyPoint) []any { + coords := make([]any, 0, len(lines)) + for _, line := range lines { + coords = append(coords, pointCoordinates(line)) + } + return coords +} + +func deepPointCoordinates(polygons [][][]xyPoint) []any { + coords := make([]any, 0, len(polygons)) + for _, polygon := range polygons { + coords = append(coords, nestedPointCoordinates(polygon)) + } + return coords +} + func addCoord(v string, delta float64) string { f, err := strconv.ParseFloat(v, 64) if err != nil { @@ -1279,6 +1375,30 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { }, gotSRID) } +func TestIngest_HTTP_GeometryTypes_ReadThroughHugr(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeometryTypesRecord(t, []string{"geo-read-a", "geo-read-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{ + geometryReadExpected("geo-read-a", [2]float64{30.5, 50.25}, 0, 0), + geometryReadExpected("geo-read-b", [2]float64{-73.935242, 40.730610}, 1, 1), + }) +} + func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { env := setupEnv(t) @@ -1355,6 +1475,9 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { } assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) assert.Equal(t, []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, srids) + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "pg-geo-bulk-049999" } }`, []map[string]any{ + geometryReadExpected("pg-geo-bulk-049999", [2]float64{99, 49}, 99, 49), + }) elapsed := time.Since(start) t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", From 78553b5a94821b9e1abd273576596df602017a6d Mon Sep 17 00:00:00 2001 From: vadim Date: Wed, 10 Jun 2026 22:34:36 +0400 Subject: [PATCH 22/36] ipc ingest --- pkg/planner/node_arrow_ingest.go | 70 +++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 2051085d..5a3b365c 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -52,15 +52,18 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) } - columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema()) + permissionData, err := resolveIngestPermissionData(ctx, provider, info, mutationField) + if err != nil { + return nil, err + } + columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema(), permissionData) if err != nil { return nil, err } if len(columns) == 0 { return nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") } - permissionData, err := checkIngestPermissions(ctx, provider, info, mutationField, columns) - if err != nil { + if err := checkIngestPermissions(ctx, provider, info, columns, permissionData); err != nil { return nil, err } return ingestNode(ctx, info, mutation, ingestEngine, columns, permissionData), nil @@ -118,10 +121,12 @@ func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObj // - info describes the GraphQL data object and its DB table/field mapping. // - mutation describes the GraphQL insert mutation and insertable fields. // - schema is the Arrow IPC schema supplied by the request body. +// - permissionData contains fields injected by permissions; required fields +// can be satisfied by either Arrow columns or these injected values. // // The returned ingestColumn values keep all three names/spaces together: // Arrow field, GraphQL object/input field, and DB table column mapping. -func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutation *sdl.Mutation, schema *arrow.Schema) ([]ingestColumn, error) { +func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutation *sdl.Mutation, schema *arrow.Schema, permissionData map[string]any) ([]ingestColumn, error) { if schema == nil { return nil, fmt.Errorf("arrow stream has no schema") } @@ -195,6 +200,9 @@ func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info * if _, ok := byName[fieldInfo.Name]; ok { continue } + if _, ok := permissionData[fieldInfo.Name]; ok { + continue + } if !fieldInfo.IsRequired() { continue } @@ -217,7 +225,7 @@ func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info * return columns, nil } -func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutationField *ast.FieldDefinition, columns []ingestColumn) (map[string]any, error) { +func resolveIngestPermissionData(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutationField *ast.FieldDefinition) (map[string]any, error) { if auth.IsFullAccess(ctx) { return nil, nil } @@ -234,30 +242,46 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info return nil, auth.ErrForbidden } - data := make(map[string]any, len(columns)) + arg := rp.DataArgument(ctx, parent, mutationField.Name) + if arg == nil { + return nil, nil + } + values, err := sdl.ParseDataAsInputObject(ctx, provider, &ast.Type{ + NamedType: info.InputInsertDataName(), + Position: base.CompiledPos("ingest permission data"), + }, arg, false) + if err != nil { + return nil, err + } + if values == nil { + return nil, nil + } + return values.(map[string]any), nil +} + +func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info *sdl.Object, columns []ingestColumn, permissionData map[string]any) error { + if auth.IsFullAccess(ctx) { + return nil + } + rp := perm.PermissionsFromCtx(ctx) + if rp == nil { + return nil + } + if rp.Disabled { + return auth.ErrForbidden + } + + data := make(map[string]any, len(columns)+len(permissionData)) for _, c := range columns { data[c.InputDef.Name] = nil } - var permissionData map[string]any - if arg := rp.DataArgument(ctx, parent, mutationField.Name); arg != nil { - values, err := sdl.ParseDataAsInputObject(ctx, provider, &ast.Type{ - NamedType: info.InputInsertDataName(), - Position: base.CompiledPos("ingest permission data"), - }, arg, false) - if err != nil { - return nil, err - } - if values != nil { - permissionData = values.(map[string]any) - for k, v := range permissionData { - data[k] = v - } - } + for k, v := range permissionData { + data[k] = v } if err := rp.CheckMutationInput(ctx, provider, info.InputInsertDataName(), data); err != nil { - return nil, err + return err } - return permissionData, nil + return nil } // ingestNode builds the INSERT ... SELECT statement that copies rows from the From 6da7dc8bf5477df7f3373f80dace4362644cf2fc Mon Sep 17 00:00:00 2001 From: vadim Date: Wed, 10 Jun 2026 22:43:50 +0400 Subject: [PATCH 23/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 91 ++++++++++++++++ .../schemas/duck_ingest/schema.graphql | 1 + .../ingest-postgres/ingest_postgres_test.go | 101 +++++++++++++++++- .../ingest-postgres/testdata/init.sql | 1 + .../testdata/schemas/pg_ingest/schema.graphql | 1 + 5 files changed, 193 insertions(+), 2 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index d4ca00ba..bdbe17d6 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -38,6 +38,8 @@ import ( "github.com/hugr-lab/query-engine/pkg/db" ) +const ingestTestAPIKey = "ingest-test-api-key" + // ingestEnv is per-test state on top of a shared hugr.Service (initialised // once in TestMain). Each test owns a unique .duckdb file and a unique data // source name, so tests don't share table state. Cleanup unloads the source @@ -70,6 +72,10 @@ func TestMain(m *testing.M) { CoreDB: coredb.New(coredb.Config{}), Auth: &auth.Config{ Providers: []auth.AuthProvider{ + auth.NewApiKey("ingest-test", auth.ApiKeyConfig{ + Key: ingestTestAPIKey, + DefaultRole: "admin", + }), auth.NewAnonymous(auth.AnonymousConfig{ Allowed: true, Role: "admin", @@ -126,6 +132,7 @@ func setupEnv(t *testing.T) *ingestEnv { name VARCHAR NOT NULL, value DOUBLE NOT NULL, is_active BOOLEAN NOT NULL DEFAULT true, + owner_id BIGINT, payload JSON, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), geom GEOMETRY, @@ -202,6 +209,40 @@ func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, var res.Close() } +func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + ctx := context.Background() + mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) { + core { + insert_roles(data: $role) { name } + allow_all: insert_role_permissions(data: $allowAll) { role type_name field_name } + inject_owner: insert_role_permissions(data: $inject) { role type_name field_name } + } + }`, map[string]any{ + "role": map[string]any{ + "name": role, + "description": "IPC ingest permission data integration test role", + }, + "allowAll": map[string]any{ + "role": role, + "type_name": "*", + "field_name": "*", + }, + "inject": map[string]any{ + "role": role, + "type_name": mutationModule, + "field_name": "insert_events", + "data": map[string]any{ + "owner_id": "[$auth.user_id_int]", + }, + }, + }) +} + +func moduleMutationName(module string) string { + return "_module_" + strings.ReplaceAll(module, ".", "_") + "_mutation" +} + func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { t.Helper() pool := memory.NewGoAllocator() @@ -289,6 +330,56 @@ func TestIngest_DuckDB_RoundTrip(t *testing.T) { assert.Equal(t, []bool{true, false, true}, gotHasJSON) } +func TestIngest_DuckDB_PermissionData(t *testing.T) { + env := setupEnv(t) + + const ownerID = 4242 + role := "ingest_perm_" + env.dsName + registerIngestPermissionRole(t, env.service, role, moduleMutationName(env.dsName)) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-alpha", "perm-beta"}, + []float64{11.5, 12.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo(strconv.Itoa(ownerID), "permission-user"), + ) + res, err := permClient.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "owner_id", "owner_id must be injected by permissions, not sent in Arrow") + + ro := env.openRO(t) + defer ro.Close() + rows, err := ro.Query("SELECT name, owner_id FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]int64{} + for rows.Next() { + var ( + name string + ownerID int64 + ) + require.NoError(t, rows.Scan(&name, &ownerID)) + got[name] = ownerID + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]int64{ + "perm-alpha": ownerID, + "perm-beta": ownerID, + }, got) +} + func TestIngest_DuckDB_UnknownColumn(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index 71eb7262..9a288279 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -3,6 +3,7 @@ type events @table(name: "events") { name: String! value: Float! is_active: Boolean! @default(value: "true") + owner_id: BigInt payload: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 6a50ae55..295a280b 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -39,8 +39,9 @@ import ( ) const ( - envPostgresDSN = "INGEST_POSTGRES_DSN" - envSchemasPath = "HUGR_INGEST_SCHEMAS_PATH" + envPostgresDSN = "INGEST_POSTGRES_DSN" + envSchemasPath = "HUGR_INGEST_SCHEMAS_PATH" + ingestTestAPIKey = "ingest-test-api-key" ) // ingestEnv is per-test view on top of a shared hugr.Service (initialised @@ -91,6 +92,10 @@ func TestMain(m *testing.M) { CoreDB: coredb.New(coredb.Config{}), Auth: &auth.Config{ Providers: []auth.AuthProvider{ + auth.NewApiKey("ingest-test", auth.ApiKeyConfig{ + Key: ingestTestAPIKey, + DefaultRole: "admin", + }), auth.NewAnonymous(auth.AnonymousConfig{ Allowed: true, Role: "admin", @@ -181,6 +186,50 @@ func setupEnv(t *testing.T) *ingestEnv { } } +func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { + t.Helper() + res, err := s.Query(ctx, q, vars) + require.NoError(t, err) + if res.Err() != nil { + require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + } + res.Close() +} + +func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + ctx := context.Background() + mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) { + core { + insert_roles(data: $role) { name } + allow_all: insert_role_permissions(data: $allowAll) { role type_name field_name } + inject_owner: insert_role_permissions(data: $inject) { role type_name field_name } + } + }`, map[string]any{ + "role": map[string]any{ + "name": role, + "description": "IPC ingest permission data integration test role", + }, + "allowAll": map[string]any{ + "role": role, + "type_name": "*", + "field_name": "*", + }, + "inject": map[string]any{ + "role": role, + "type_name": mutationModule, + "field_name": "insert_events", + "data": map[string]any{ + "owner_id": "[$auth.user_id_int]", + }, + }, + }) +} + +func moduleMutationName(module string) string { + return "_module_" + strings.ReplaceAll(module, ".", "_") + "_mutation" +} + // makeEventsRecord builds a single Arrow RecordBatch with the columns of the // pg_ingest.events table (excluding id, which is autogen). func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { @@ -264,6 +313,54 @@ func TestIngest_Postgres_RoundTrip(t *testing.T) { assert.Equal(t, []bool{true, false, true}, gotHasJSON) // beta has NULL payload } +func TestIngest_Postgres_PermissionData(t *testing.T) { + env := setupEnv(t) + + const ownerID = 4343 + role := "ingest_perm_pg" + registerIngestPermissionRole(t, env.service, role, moduleMutationName(env.dsName)) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-alpha", "perm-beta"}, + []float64{11.5, 12.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo(strconv.Itoa(ownerID), "permission-user"), + ) + res, err := permClient.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "owner_id", "owner_id must be injected by permissions, not sent in Arrow") + + rows, err := env.pgConn.Query("SELECT name, owner_id FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]int64{} + for rows.Next() { + var ( + name string + ownerID int64 + ) + require.NoError(t, rows.Scan(&name, &ownerID)) + got[name] = ownerID + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]int64{ + "perm-alpha": ownerID, + "perm-beta": ownerID, + }, got) +} + func TestIngest_Postgres_MultipleBatches(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index db56f54b..d7b3cf14 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -10,6 +10,7 @@ CREATE TABLE events ( name VARCHAR NOT NULL, value DOUBLE PRECISION NOT NULL, is_active BOOLEAN NOT NULL DEFAULT true, + owner_id BIGINT, payload JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), geom GEOMETRY(Point, 4326), diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 71eb7262..9a288279 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -3,6 +3,7 @@ type events @table(name: "events") { name: String! value: Float! is_active: Boolean! @default(value: "true") + owner_id: BigInt payload: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) From c11bff453b3ee946a067a33f92bd9c2f4d09a184 Mon Sep 17 00:00:00 2001 From: vadim Date: Fri, 12 Jun 2026 00:18:51 +0400 Subject: [PATCH 24/36] ipc ingest --- ipc-ingest.go | 6 ++- pkg/arrow-ingest/source.go | 68 ++++++++++++++++++++++++++++++++ pkg/db/pool.go | 44 ++++----------------- pkg/planner/node_arrow_ingest.go | 17 ++++---- pkg/planner/planer.go | 11 +++--- 5 files changed, 94 insertions(+), 52 deletions(-) create mode 100644 pkg/arrow-ingest/source.go diff --git a/ipc-ingest.go b/ipc-ingest.go index 7432d44b..6400e9d4 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -10,6 +10,7 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" + arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/auth" "github.com/hugr-lab/query-engine/pkg/perm" ) @@ -74,8 +75,9 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { return } defer reader.Release() + source := arrowingest.NewSource(reader) - plan, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, reader) + plan, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, source) if err != nil { if errors.Is(err, auth.ErrForbidden) { writeIngestError(w, http.StatusForbidden, err.Error()) @@ -93,7 +95,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { writeIngestError(w, http.StatusInternalServerError, "arrow ingest plan produced SQL parameters") return } - res, err := s.db.ExecWithArrowView(ctx, reader, plan.CompiledQuery) + res, err := s.db.ExecArrowIngest(ctx, source, plan.CompiledQuery) if err != nil { writeIngestError(w, http.StatusInternalServerError, err.Error()) return diff --git a/pkg/arrow-ingest/source.go b/pkg/arrow-ingest/source.go new file mode 100644 index 00000000..07a9eacf --- /dev/null +++ b/pkg/arrow-ingest/source.go @@ -0,0 +1,68 @@ +package arrowingest + +import ( + "fmt" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" +) + +const DefaultViewName = "_hugr_arrow_view" + +// Source is the shared contract between the IPC ingest handler, planner, and +// DB executor. The planner builds SQL against ViewName; the DB executor +// registers Reader under the same per-connection DuckDB view name. +type Source struct { + Reader array.RecordReader + ViewName string +} + +func NewSource(reader array.RecordReader) Source { + return Source{ + Reader: reader, + ViewName: DefaultViewName, + } +} + +func (s Source) View() string { + if s.ViewName == "" { + return DefaultViewName + } + return s.ViewName +} + +// NeedsSpatial reports whether the Arrow source carries geometry extension +// metadata that requires DuckDB's spatial extension before registering the view. +func (s Source) NeedsSpatial() bool { + if s.Reader == nil || s.Reader.Schema() == nil { + return false + } + for _, f := range s.Reader.Schema().Fields() { + if extType, ok := f.Type.(arrow.ExtensionType); ok && isGeometryArrowExtension(extType.ExtensionName()) { + return true + } + if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isGeometryArrowExtension(ext) { + return true + } + if ext, ok := f.Metadata.GetValue("extension:name"); ok && isGeometryArrowExtension(ext) { + return true + } + } + return false +} + +// RegisterView registers the source reader under the source view name. +func (s Source) RegisterView(arrowConn interface { + RegisterView(reader array.RecordReader, viewName string) (func(), error) +}) (func(), error) { + if s.Reader == nil { + return nil, fmt.Errorf("missing arrow reader") + } + return arrowConn.RegisterView(s.Reader, s.View()) +} + +func isGeometryArrowExtension(ext string) bool { + ext = strings.ToLower(ext) + return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" +} diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 8d2b59cf..096eb6a6 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -8,16 +8,10 @@ import ( "strings" "sync" - "github.com/apache/arrow-go/v18/arrow" - "github.com/apache/arrow-go/v18/arrow/array" "github.com/duckdb/duckdb-go/v2" + arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" ) -// TempArrowViewName is the fixed per-connection view name used by -// ExecWithArrowView. DuckDB views registered from Arrow readers are scoped to -// the driver connection, so a stable name is safe across concurrent requests. -const TempArrowViewName = "_hugr_arrow_view" - type Config struct { Path string `json:"path"` MaxOpenConns int `json:"max_open_conns"` @@ -228,11 +222,10 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { }, nil } -// ExecWithArrowView registers reader as TempArrowViewName and executes query on -// the same DuckDB driver connection, where the temporary Arrow view is visible. -// todo rename => ExecWithArrow / ExecArrow -func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, query string) (sql.Result, error) { - if reader == nil { +// ExecArrowIngest registers source.Reader as source.ViewName and executes query +// on the same DuckDB driver connection, where the temporary Arrow view is visible. +func (p *Pool) ExecArrowIngest(ctx context.Context, source arrowingest.Source, query string) (sql.Result, error) { + if source.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } ar, err := p.Arrow(ctx) @@ -245,12 +238,12 @@ func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, if !ok { return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") } - if arrowViewNeedsSpatial(reader) { + if source.NeedsSpatial() { if _, err := execer.ExecContext(ctx, "LOAD spatial", nil); err != nil { return nil, fmt.Errorf("prepare spatial arrow view: %w", err) } } - release, err := ar.RegisterView(reader, TempArrowViewName) + release, err := source.RegisterView(ar) if err != nil { return nil, fmt.Errorf("register arrow view: %w", err) } @@ -259,29 +252,6 @@ func (p *Pool) ExecWithArrowView(ctx context.Context, reader array.RecordReader, return execer.ExecContext(ctx, query, nil) } -func arrowViewNeedsSpatial(reader array.RecordReader) bool { - if reader == nil || reader.Schema() == nil { - return false - } - for _, f := range reader.Schema().Fields() { - if extType, ok := f.Type.(arrow.ExtensionType); ok && isGeometryArrowExtension(extType.ExtensionName()) { - return true - } - if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isGeometryArrowExtension(ext) { - return true - } - if ext, ok := f.Metadata.GetValue("extension:name"); ok && isGeometryArrowExtension(ext) { - return true - } - } - return false -} - -func isGeometryArrowExtension(ext string) bool { - ext = strings.ToLower(ext) - return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" -} - func (p *Pool) RegisterScalarFunction(ctx context.Context, function ScalarFunction) error { return RegisterScalarFunction(ctx, p, function) } diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 5a3b365c..46df8992 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -6,12 +6,11 @@ import ( "strings" "github.com/apache/arrow-go/v18/arrow" - "github.com/apache/arrow-go/v18/arrow/array" + arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/auth" "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" - "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/pkg/perm" "github.com/vektah/gqlparser/v2/ast" @@ -24,11 +23,11 @@ type ingestColumn struct { InputDef *ast.FieldDefinition } -func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, reader array.RecordReader) (*QueryPlanNode, error) { +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, source arrowingest.Source) (*QueryPlanNode, error) { if dataObject == "" { return nil, fmt.Errorf("missing data object") } - if reader == nil { + if source.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } @@ -56,7 +55,7 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err != nil { return nil, err } - columns, err := resolveIngestColumns(ctx, provider, info, mutation, reader.Schema(), permissionData) + columns, err := resolveIngestColumns(ctx, provider, info, mutation, source.Reader.Schema(), permissionData) if err != nil { return nil, err } @@ -66,7 +65,7 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err := checkIngestPermissions(ctx, provider, info, columns, permissionData); err != nil { return nil, err } - return ingestNode(ctx, info, mutation, ingestEngine, columns, permissionData), nil + return ingestNode(ctx, info, mutation, ingestEngine, columns, permissionData, source.View()), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -295,7 +294,9 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info // columns by resolveIngestColumns. // - permissionData contains extra GraphQL input values injected by the // permission layer; they do not come from the Arrow stream. -func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any) *QueryPlanNode { +// - arrowViewName is the per-connection DuckDB view registered from the +// Arrow reader during execution. +func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any, arrowViewName string) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { @@ -387,7 +388,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e target, strings.Join(targetFields, ", "), strings.Join(selectExprs, ", "), - engines.Ident(db.TempArrowViewName), + engines.Ident(arrowViewName), ), params, nil }, } diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index be9353c1..0fd0a74d 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -4,7 +4,7 @@ import ( "context" "errors" - "github.com/apache/arrow-go/v18/arrow/array" + arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" "github.com/hugr-lab/query-engine/pkg/engines" @@ -68,10 +68,11 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as } // PlanArrowIngest builds an INSERT-from-Arrow-view plan for the target data object. -// The Arrow reader is part of this planning API because its schema drives column -// resolution and ingest casting, while execution registers it as a temporary view. -func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, reader array.RecordReader) (*QueryPlan, error) { - node, err := ingestRootNode(ctx, provider, s.engines, dataObject, reader) +// The Arrow source is part of this planning API because its schema drives column +// resolution and ingest casting, while its view name is the staging relation used +// in the generated INSERT ... SELECT. +func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, source arrowingest.Source) (*QueryPlan, error) { + node, err := ingestRootNode(ctx, provider, s.engines, dataObject, source) if err != nil { return nil, err } From b9fb01209b5b8c438acf281324ba36beaa39467a Mon Sep 17 00:00:00 2001 From: vadim Date: Fri, 12 Jun 2026 11:17:14 +0400 Subject: [PATCH 25/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 62 +++++++++++++++++- .../ingest-postgres/ingest_postgres_test.go | 64 ++++++++++++++++++- pkg/engines/duckdb.go | 15 ++++- pkg/perm/permissions.go | 7 +- 4 files changed, 139 insertions(+), 9 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index bdbe17d6..9a47e7eb 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -210,6 +210,13 @@ func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, var } func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + registerIngestPermissionRoleData(t, service, role, mutationModule, map[string]any{ + "owner_id": "[$auth.user_id_int]", + }) +} + +func registerIngestPermissionRoleData(t *testing.T, service *hugr.Service, role, mutationModule string, data map[string]any) { t.Helper() ctx := context.Background() mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) { @@ -232,9 +239,7 @@ func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mut "role": role, "type_name": mutationModule, "field_name": "insert_events", - "data": map[string]any{ - "owner_id": "[$auth.user_id_int]", - }, + "data": data, }, }) } @@ -380,6 +385,57 @@ func TestIngest_DuckDB_PermissionData(t *testing.T) { }, got) } +func TestIngest_DuckDB_PermissionDataGeometry(t *testing.T) { + env := setupEnv(t) + + role := "ingest_perm_geom_" + env.dsName + registerIngestPermissionRoleData(t, env.service, role, moduleMutationName(env.dsName), map[string]any{ + "geom": "POINT (7.25 8.5)", + }) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-geom-alpha", "perm-geom-beta"}, + []float64{21.5, 22.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo("7", "permission-geometry-user"), + ) + res, err := permClient.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "geom", "geom must be injected by permissions, not sent in Arrow") + + ro := env.openRO(t) + defer ro.Close() + _, err = ro.Exec("LOAD spatial") + require.NoError(t, err) + + rows, err := ro.Query("SELECT name, ST_AsText(geom) FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + for rows.Next() { + var name, geom string + require.NoError(t, rows.Scan(&name, &geom)) + got[name] = compactWKT(geom) + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "perm-geom-alpha": "POINT(7.25 8.5)", + "perm-geom-beta": "POINT(7.25 8.5)", + }, got) +} + func TestIngest_DuckDB_UnknownColumn(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 295a280b..0db34784 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -197,6 +197,13 @@ func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, var } func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + registerIngestPermissionRoleData(t, service, role, mutationModule, map[string]any{ + "owner_id": "[$auth.user_id_int]", + }) +} + +func registerIngestPermissionRoleData(t *testing.T, service *hugr.Service, role, mutationModule string, data map[string]any) { t.Helper() ctx := context.Background() mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) { @@ -219,9 +226,7 @@ func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mut "role": role, "type_name": mutationModule, "field_name": "insert_events", - "data": map[string]any{ - "owner_id": "[$auth.user_id_int]", - }, + "data": data, }, }) } @@ -361,6 +366,59 @@ func TestIngest_Postgres_PermissionData(t *testing.T) { }, got) } +func TestIngest_Postgres_PermissionDataGeometry(t *testing.T) { + env := setupEnv(t) + + role := "ingest_perm_geom_pg" + registerIngestPermissionRoleData(t, env.service, role, moduleMutationName(env.dsName), map[string]any{ + "geom": "POINT (7.25 8.5)", + }) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-geom-alpha", "perm-geom-beta"}, + []float64{21.5, 22.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo("7", "permission-geometry-user"), + ) + res, err := permClient.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "geom", "geom must be injected by permissions, not sent in Arrow") + + rows, err := env.pgConn.Query("SELECT name, ST_AsText(geom), ST_SRID(geom) FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + gotSRID := map[string]int{} + for rows.Next() { + var name, geom string + var srid int + require.NoError(t, rows.Scan(&name, &geom, &srid)) + got[name] = compactWKT(geom) + gotSRID[name] = srid + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "perm-geom-alpha": "POINT(7.25 8.5)", + "perm-geom-beta": "POINT(7.25 8.5)", + }, got) + assert.Equal(t, map[string]int{ + "perm-geom-alpha": 4326, + "perm-geom-beta": 4326, + }, gotSRID) +} + func TestIngest_Postgres_MultipleBatches(t *testing.T) { env := setupEnv(t) diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index adfa51c4..f12f3397 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -90,7 +90,20 @@ func (e *DuckDB) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, return duckDBArrowIngestSelectExpr(field, arrowField, sourceExpr) } -func (e *DuckDB) ArrowIngestLiteralExpr(_ *ast.Field, value any) (string, error) { +func (e *DuckDB) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { + if value == nil { + return "NULL", nil + } + if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { + geom, err := ctypes.ParseGeometryValue(value) + if err != nil { + return "", err + } + if geom == nil { + return "NULL", nil + } + return e.SQLValue(geom) + } return e.SQLValue(value) } diff --git a/pkg/perm/permissions.go b/pkg/perm/permissions.go index a3dd0624..d11d8e77 100644 --- a/pkg/perm/permissions.go +++ b/pkg/perm/permissions.go @@ -8,9 +8,9 @@ import ( "github.com/vektah/gqlparser/v2/ast" "github.com/hugr-lab/query-engine/pkg/auth" - "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/engines" ) type RolePermissions struct { @@ -54,7 +54,6 @@ func (r *RolePermissions) CheckQuery(query *ast.Field) error { return nil } - func (r *RolePermissions) CheckMutationInput(ctx context.Context, defs base.DefinitionsSource, inputName string, data map[string]any) error { if r.Disabled { return auth.ErrForbidden @@ -153,11 +152,15 @@ func applyContextVariable(ctx context.Context, data map[string]any, vars map[str v[i] = applyContextVariable(ctx, vv, vars) } } + res[k] = v case string: if val, ok := vars[v]; ok { res[k] = val continue } + res[k] = v + default: + res[k] = v } } From 4bb2c42821ee52a1914775f9daa6a94d8a5f336d Mon Sep 17 00:00:00 2001 From: vadim Date: Fri, 12 Jun 2026 11:47:16 +0400 Subject: [PATCH 26/36] ipc ingest --- pkg/engines/arrow_ingest.go | 23 +++++++++++++++++++-- pkg/engines/arrow_ingest_test.go | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 6a086b9d..51a53808 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -49,7 +49,19 @@ func duckDBArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, } func duckDBArrowGeometryWKTExpr(arrowField arrow.Field, sourceExpr string) (string, error) { - switch arrowExtensionName(arrowField) { + if ext := arrowExtensionName(arrowField); ext != "" { + return duckDBArrowGeometryWKTExprFromTrustedExtension(ext, sourceExpr) + } + return duckDBArrowGeometryWKTExprFromPhysicalType(arrowField, sourceExpr) +} + +// duckDBArrowGeometryWKTExprFromTrustedExtension uses GeoArrow/Hugr extension +// metadata as the source of truth for geometry semantics. The physical Arrow +// storage type is intentionally not used as a fallback once extension metadata +// is present; unsupported metadata should fail during planning instead of being +// guessed from Type.ID(). +func duckDBArrowGeometryWKTExprFromTrustedExtension(ext, sourceExpr string) (string, error) { + switch ext { case "geoarrow.wkb": return "ST_AsText(" + sourceExpr + ")", nil case "geoarrow.wkt": @@ -59,9 +71,16 @@ func duckDBArrowGeometryWKTExpr(arrowField arrow.Field, sourceExpr string) (stri case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - return duckDBGeoArrowNativeWKT(arrowExtensionName(arrowField), sourceExpr) + return duckDBGeoArrowNativeWKT(ext, sourceExpr) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) } +} +// duckDBArrowGeometryWKTExprFromPhysicalType is the best-effort path for +// unannotated Arrow columns. Without extension metadata we infer common +// geometry encodings from physical Arrow storage. +func duckDBArrowGeometryWKTExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { switch arrowField.Type.ID() { case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: return "ST_AsText(ST_GeomFromWKB(" + sourceExpr + "))", nil diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index 4398d0cd..d9e36baa 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -96,6 +96,40 @@ func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { } } +func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { + field := geometryTestField("") + for _, tt := range []struct { + name string + typ arrow.DataType + ext string + }{ + { + name: "string-like column does not fall back to WKT when metadata is unsupported", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.curve", + }, + { + name: "binary-like column does not fall back to WKB when metadata is unsupported", + typ: arrow.BinaryTypes.Binary, + ext: "hugr.unknown_geometry", + }, + } { + t.Run(tt.name, func(t *testing.T) { + _, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + Name: "geom", + Type: tt.typ, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), + }, "geom") + if err == nil { + t.Fatalf("expected unsupported extension %q to be rejected", tt.ext) + } + if !strings.Contains(err.Error(), "unsupported GeoArrow extension") { + t.Fatalf("unexpected error for %q: %v", tt.ext, err) + } + }) + } +} + func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { engine := &Postgres{} From a58c2bb2df675a4c779256bfeed8cb1415994fd5 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 13 Jun 2026 14:05:27 +0400 Subject: [PATCH 27/36] ipc ingest experimental geom --- .gitignore | 1 + .../ingest-duckdb/ingest_duckdb_test.go | 117 ++++++++--- .../schemas/duck_ingest/schema.graphql | 2 + .../ingest-postgres/ingest_postgres_test.go | 134 +++++++++---- .../ingest-postgres/testdata/init.sql | 2 + .../testdata/schemas/pg_ingest/schema.graphql | 2 + pkg/engines/arrow_ingest.go | 113 +++++++++-- pkg/engines/arrow_ingest_test.go | 187 ++++++++++++++++-- pkg/engines/duckdb.go | 7 +- pkg/engines/engines.go | 8 +- pkg/engines/postgres.go | 2 +- pkg/planner/node_arrow_ingest.go | 4 +- 12 files changed, 483 insertions(+), 96 deletions(-) diff --git a/.gitignore b/.gitignore index 50268021..027a93ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS* .local .claude/ +.idea/ .specify/ specs/ design/ diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index 9a47e7eb..e2c05b89 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -138,6 +138,8 @@ func setupEnv(t *testing.T) *ingestEnv { geom GEOMETRY, geom_wkt GEOMETRY, geom_geojson GEOMETRY, + geom_hugr_geojson GEOMETRY, + geom_plain_geojson GEOMETRY, geom_wkb GEOMETRY, geom_line GEOMETRY, geom_polygon_native GEOMETRY, @@ -994,10 +996,12 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { require.NoError(t, err) rows, err := ro.Query(` - SELECT name, - ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb), - ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), - ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) + SELECT name, + ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), + ST_AsText(geom_hugr_geojson), ST_AsText(geom_plain_geojson), + ST_AsText(geom_wkb), + ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), + ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) FROM events WHERE name LIKE 'geo-%' ORDER BY name @@ -1008,7 +1012,7 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { got := map[string][]string{} for rows.Next() { var name string - values := make([]string, 9) + values := make([]string, 11) scanArgs := []any{&name} for i := range values { scanArgs = append(scanArgs, &values[i]) @@ -1107,14 +1111,16 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - values := make([]string, 9) + values := make([]string, 11) require.NoError(t, ro.QueryRow(` - SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), ST_AsText(geom_wkb), + SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), + ST_AsText(geom_hugr_geojson), ST_AsText(geom_plain_geojson), + ST_AsText(geom_wkb), ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) FROM events WHERE name = 'dk-geo-bulk-049999' - `).Scan(&values[0], &values[1], &values[2], &values[3], &values[4], &values[5], &values[6], &values[7], &values[8])) + `).Scan(&values[0], &values[1], &values[2], &values[3], &values[4], &values[5], &values[6], &values[7], &values[8], &values[9], &values[10])) for i := range values { values[i] = compactWKT(values[i]) } @@ -1177,6 +1183,8 @@ func geometryTypesSchema() *arrow.Schema { {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, + {Name: "geom_hugr_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.geojson"})}, + {Name: "geom_plain_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geojson"})}, {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, @@ -1189,7 +1197,8 @@ func geometryTypesSchema() *arrow.Schema { func geometryTypesColumns() []string { return []string{ "name", "value", "is_active", - "geom", "geom_wkt", "geom_geojson", "geom_wkb", + "geom", "geom_wkt", "geom_geojson", + "geom_hugr_geojson", "geom_plain_geojson", "geom_wkb", "geom_line", "geom_polygon_native", "geom_multipoint", "geom_multiline", "geom_multipolygon", } @@ -1199,16 +1208,53 @@ func geometryExpected(point, x, y string) []string { return []string{ point, fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + polygonWKT(x, y), + polygonWKT(x, y), + polygonWKT(x, y), point, fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + polygonWKT(x, y), fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), - fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s)))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + multiPolygonWKT(x, y), } } +func polygonWKT(x, y string) string { + return fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + ) +} + +func multiPolygonWKT(x, y string) string { + return fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s)),((%s %s,%s %s,%s %s,%s %s,%s %s)))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 10), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 10), + ) +} + func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { t.Helper() @@ -1217,9 +1263,11 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, events(%s, order_by: [{field: "name", direction: ASC}]) { name geom - geom_wkt - geom_geojson - geom_wkb + geom_wkt + geom_geojson + geom_hugr_geojson + geom_plain_geojson + geom_wkb geom_line geom_polygon_native geom_multipoint @@ -1261,6 +1309,8 @@ func geometryReadExpected(name string, point [2]float64, x, y float64) map[strin "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), @@ -1348,13 +1398,15 @@ func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(6).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(7).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) - b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) - appendPointList(b.Field(7).(*array.ListBuilder), linePoints(shapeX, shapeY)) - appendPointListList(b.Field(8).(*array.ListBuilder), polygonRings(shapeX, shapeY)) - appendPointList(b.Field(9).(*array.ListBuilder), multiPoints(shapeX, shapeY)) - appendPointListList(b.Field(10).(*array.ListBuilder), multiLines(shapeX, shapeY)) - appendPointListListList(b.Field(11).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) + b.Field(8).(*array.BinaryBuilder).Append(wkbPoint) + appendPointList(b.Field(9).(*array.ListBuilder), linePoints(shapeX, shapeY)) + appendPointListList(b.Field(10).(*array.ListBuilder), polygonRings(shapeX, shapeY)) + appendPointList(b.Field(11).(*array.ListBuilder), multiPoints(shapeX, shapeY)) + appendPointListList(b.Field(12).(*array.ListBuilder), multiLines(shapeX, shapeY)) + appendPointListListList(b.Field(13).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) } type xyPoint [2]float64 @@ -1394,7 +1446,10 @@ func linePoints(x, y float64) []xyPoint { } func polygonRings(x, y float64) [][]xyPoint { - return [][]xyPoint{{{x, y}, {x, y + 1}, {x + 1, y + 1}, {x + 1, y}, {x, y}}} + return [][]xyPoint{ + {{x, y}, {x, y + 4}, {x + 4, y + 4}, {x + 4, y}, {x, y}}, + {{x + 1, y + 1}, {x + 2, y + 1}, {x + 2, y + 2}, {x + 1, y + 2}, {x + 1, y + 1}}, + } } func multiPoints(x, y float64) []xyPoint { @@ -1409,7 +1464,10 @@ func multiLines(x, y float64) [][]xyPoint { } func multiPolygons(x, y float64) [][][]xyPoint { - return [][][]xyPoint{polygonRings(x, y)} + return [][][]xyPoint{ + polygonRings(x, y), + {{{x + 10, y + 10}, {x + 10, y + 12}, {x + 12, y + 12}, {x + 12, y + 10}, {x + 10, y + 10}}}, + } } func lineWKT(x, y float64) string { @@ -1420,12 +1478,17 @@ func lineWKT(x, y float64) string { } func polygonGeoJSON(x, y float64) string { - return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + coord(x), coord(y), + coord(x), coord(y+4), + coord(x+4), coord(y+4), + coord(x+4), coord(y), coord(x), coord(y), - coord(x), coord(y+1), coord(x+1), coord(y+1), - coord(x+1), coord(y), - coord(x), coord(y)) + coord(x+2), coord(y+1), + coord(x+2), coord(y+2), + coord(x+1), coord(y+2), + coord(x+1), coord(y+1)) } func coord(v float64) string { diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index 9a288279..3c410e3f 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -9,6 +9,8 @@ type events @table(name: "events") { geom: Geometry @geometry_info(srid: 4326, type: POINT) geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_hugr_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_plain_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 0db34784..9f7aac8a 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -641,6 +641,8 @@ func geometryTypesSchema() *arrow.Schema { {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, + {Name: "geom_hugr_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.geojson"})}, + {Name: "geom_plain_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geojson"})}, {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, @@ -653,7 +655,8 @@ func geometryTypesSchema() *arrow.Schema { func geometryTypesColumns() []string { return []string{ "name", "value", "is_active", - "geom", "geom_wkt", "geom_geojson", "geom_wkb", + "geom", "geom_wkt", "geom_geojson", + "geom_hugr_geojson", "geom_plain_geojson", "geom_wkb", "geom_line", "geom_polygon_native", "geom_multipoint", "geom_multiline", "geom_multipolygon", } @@ -663,16 +666,53 @@ func geometryExpected(point, x, y string) []string { return []string{ point, fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + polygonWKT(x, y), + polygonWKT(x, y), + polygonWKT(x, y), point, fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + polygonWKT(x, y), fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), - fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s)))", x, y, x, addCoord(y, 1), addCoord(x, 1), addCoord(y, 1), addCoord(x, 1), y, x, y), + multiPolygonWKT(x, y), } } +func polygonWKT(x, y string) string { + return fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + ) +} + +func multiPolygonWKT(x, y string) string { + return fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s)),((%s %s,%s %s,%s %s,%s %s,%s %s)))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 10), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 10), + ) +} + func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { t.Helper() @@ -681,9 +721,11 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, events(%s, order_by: [{field: "name", direction: ASC}]) { name geom - geom_wkt - geom_geojson - geom_wkb + geom_wkt + geom_geojson + geom_hugr_geojson + geom_plain_geojson + geom_wkb geom_line geom_polygon_native geom_multipoint @@ -725,6 +767,8 @@ func geometryReadExpected(name string, point [2]float64, x, y float64) map[strin "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), @@ -812,13 +856,15 @@ func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(6).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(7).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) - b.Field(6).(*array.BinaryBuilder).Append(wkbPoint) - appendPointList(b.Field(7).(*array.ListBuilder), linePoints(shapeX, shapeY)) - appendPointListList(b.Field(8).(*array.ListBuilder), polygonRings(shapeX, shapeY)) - appendPointList(b.Field(9).(*array.ListBuilder), multiPoints(shapeX, shapeY)) - appendPointListList(b.Field(10).(*array.ListBuilder), multiLines(shapeX, shapeY)) - appendPointListListList(b.Field(11).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) + b.Field(8).(*array.BinaryBuilder).Append(wkbPoint) + appendPointList(b.Field(9).(*array.ListBuilder), linePoints(shapeX, shapeY)) + appendPointListList(b.Field(10).(*array.ListBuilder), polygonRings(shapeX, shapeY)) + appendPointList(b.Field(11).(*array.ListBuilder), multiPoints(shapeX, shapeY)) + appendPointListList(b.Field(12).(*array.ListBuilder), multiLines(shapeX, shapeY)) + appendPointListListList(b.Field(13).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) } type xyPoint [2]float64 @@ -858,7 +904,10 @@ func linePoints(x, y float64) []xyPoint { } func polygonRings(x, y float64) [][]xyPoint { - return [][]xyPoint{{{x, y}, {x, y + 1}, {x + 1, y + 1}, {x + 1, y}, {x, y}}} + return [][]xyPoint{ + {{x, y}, {x, y + 4}, {x + 4, y + 4}, {x + 4, y}, {x, y}}, + {{x + 1, y + 1}, {x + 2, y + 1}, {x + 2, y + 2}, {x + 1, y + 2}, {x + 1, y + 1}}, + } } func multiPoints(x, y float64) []xyPoint { @@ -873,7 +922,10 @@ func multiLines(x, y float64) [][]xyPoint { } func multiPolygons(x, y float64) [][][]xyPoint { - return [][][]xyPoint{polygonRings(x, y)} + return [][][]xyPoint{ + polygonRings(x, y), + {{{x + 10, y + 10}, {x + 10, y + 12}, {x + 12, y + 12}, {x + 12, y + 10}, {x + 10, y + 10}}}, + } } func lineWKT(x, y float64) string { @@ -884,12 +936,17 @@ func lineWKT(x, y float64) string { } func polygonGeoJSON(x, y float64) string { - return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + coord(x), coord(y), + coord(x), coord(y+4), + coord(x+4), coord(y+4), + coord(x+4), coord(y), coord(x), coord(y), - coord(x), coord(y+1), coord(x+1), coord(y+1), - coord(x+1), coord(y), - coord(x), coord(y)) + coord(x+2), coord(y+1), + coord(x+2), coord(y+2), + coord(x+1), coord(y+2), + coord(x+1), coord(y+1)) } func coord(v float64) string { @@ -1485,11 +1542,13 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { assert.ElementsMatch(t, geometryTypesColumns(), out.Columns) rows, err := env.pgConn.Query(` - SELECT name, - ST_AsText(geom), ST_SRID(geom), - ST_AsText(geom_wkt), ST_SRID(geom_wkt), - ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb), + SELECT name, + ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_geojson), ST_SRID(geom_geojson), + ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), + ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), + ST_AsText(geom_wkb), ST_SRID(geom_wkb), ST_AsText(geom_line), ST_SRID(geom_line), ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), @@ -1506,8 +1565,8 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { gotSRID := map[string][]int{} for rows.Next() { var name string - values := make([]string, 9) - srids := make([]int, 9) + values := make([]string, 11) + srids := make([]int, 11) scanArgs := []any{&name} for i := range values { scanArgs = append(scanArgs, &values[i], &srids[i]) @@ -1525,8 +1584,8 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { "geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"), }, got) assert.Equal(t, map[string][]int{ - "geo-a": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, - "geo-b": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, + "geo-a": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, + "geo-b": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, }, gotSRID) } @@ -1606,13 +1665,15 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - values := make([]string, 9) - srids := make([]int, 9) + values := make([]string, 11) + srids := make([]int, 11) require.NoError(t, env.pgConn.QueryRow(` - SELECT ST_AsText(geom), ST_SRID(geom), - ST_AsText(geom_wkt), ST_SRID(geom_wkt), - ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb), + SELECT ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_geojson), ST_SRID(geom_geojson), + ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), + ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), + ST_AsText(geom_wkb), ST_SRID(geom_wkb), ST_AsText(geom_line), ST_SRID(geom_line), ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), @@ -1620,16 +1681,17 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { ST_AsText(geom_multipolygon), ST_SRID(geom_multipolygon) FROM events WHERE name = 'pg-geo-bulk-049999' - `).Scan( + `).Scan( &values[0], &srids[0], &values[1], &srids[1], &values[2], &srids[2], &values[3], &srids[3], &values[4], &srids[4], &values[5], &srids[5], &values[6], &srids[6], &values[7], &srids[7], &values[8], &srids[8], + &values[9], &srids[9], &values[10], &srids[10], )) for i := range values { values[i] = compactWKT(values[i]) } assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) - assert.Equal(t, []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, srids) + assert.Equal(t, []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, srids) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "pg-geo-bulk-049999" } }`, []map[string]any{ geometryReadExpected("pg-geo-bulk-049999", [2]float64{99, 49}, 99, 49), }) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index d7b3cf14..09add66e 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -16,6 +16,8 @@ CREATE TABLE events ( geom GEOMETRY(Point, 4326), geom_wkt GEOMETRY(LineString, 4326), geom_geojson GEOMETRY(Polygon, 4326), + geom_hugr_geojson GEOMETRY(Polygon, 4326), + geom_plain_geojson GEOMETRY(Polygon, 4326), geom_wkb GEOMETRY(Point, 4326), geom_line GEOMETRY(LineString, 4326), geom_polygon_native GEOMETRY(Polygon, 4326), diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 9a288279..3c410e3f 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -9,6 +9,8 @@ type events @table(name: "events") { geom: Geometry @geometry_info(srid: 4326, type: POINT) geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_hugr_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_plain_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 51a53808..ec9eef76 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -41,26 +41,63 @@ func duckDBArrowJSONExpr(arrowField arrow.Field, sourceExpr string) string { } func duckDBArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, error) { - wktExpr, err := duckDBArrowGeometryWKTExpr(arrowField, sourceExpr) - if err != nil { - return "", err - } - return "ST_GeomFromText(" + wktExpr + ", true)", nil -} - -func duckDBArrowGeometryWKTExpr(arrowField arrow.Field, sourceExpr string) (string, error) { if ext := arrowExtensionName(arrowField); ext != "" { - return duckDBArrowGeometryWKTExprFromTrustedExtension(ext, sourceExpr) + return duckDBArrowGeometryExprFromTrustedExtension(ext, sourceExpr) } - return duckDBArrowGeometryWKTExprFromPhysicalType(arrowField, sourceExpr) + return duckDBArrowGeometryExprFromPhysicalType(arrowField, sourceExpr) } -// duckDBArrowGeometryWKTExprFromTrustedExtension uses GeoArrow/Hugr extension +// duckDBArrowGeometryExprFromTrustedExtension uses GeoArrow/Hugr extension // metadata as the source of truth for geometry semantics. The physical Arrow // storage type is intentionally not used as a fallback once extension metadata // is present; unsupported metadata should fail during planning instead of being // guessed from Type.ID(). -func duckDBArrowGeometryWKTExprFromTrustedExtension(ext, sourceExpr string) (string, error) { +func duckDBArrowGeometryExprFromTrustedExtension(ext, sourceExpr string) (string, error) { + switch ext { + case "geoarrow.wkb": + return sourceExpr, nil + case "geoarrow.wkt": + return "ST_GeomFromText(" + sourceExpr + ", true)", nil + case "hugr.geojson", "geoarrow.geojson", "geojson": + return "ST_GeomFromGeoJSON(" + sourceExpr + ")", nil + case "geoarrow.linestring", "geoarrow.polygon", + "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", + "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": + return duckDBGeoArrowNativeGeometryExpr(ext, sourceExpr) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) + } +} + +// duckDBArrowGeometryExprFromPhysicalType is the best-effort path for +// unannotated Arrow columns. Without extension metadata we infer common +// geometry encodings from physical Arrow storage. +func duckDBArrowGeometryExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { + switch arrowField.Type.ID() { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: + return "ST_GeomFromWKB(" + sourceExpr + ")", nil + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return "CASE WHEN starts_with(trim(" + sourceExpr + "), '{') THEN ST_GeomFromGeoJSON(" + sourceExpr + ") ELSE ST_GeomFromText(" + sourceExpr + ", true) END", nil + case arrow.STRUCT, arrow.MAP: + return "ST_GeomFromGeoJSON(to_json(" + sourceExpr + ")::VARCHAR)", nil + default: + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) + } +} + +func duckDBArrowGeometryWKTWireExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + if ext := arrowExtensionName(arrowField); ext != "" { + return duckDBArrowGeometryWKTWireExprFromTrustedExtension(ext, sourceExpr) + } + return duckDBArrowGeometryWKTWireExprFromPhysicalType(arrowField, sourceExpr) +} + +// duckDBArrowGeometryWKTWireExprFromTrustedExtension builds a DuckDB expression +// returning WKT text for engines whose insert path cannot accept DuckDB +// GEOMETRY/WKB values directly. This is currently needed for Postgres attached +// tables: DuckDB's postgres extension COPY path accepts WKT/EWKT text for +// PostGIS geometry columns, but rejects WKB_BLOB/HEXWKB expressions. +func duckDBArrowGeometryWKTWireExprFromTrustedExtension(ext, sourceExpr string) (string, error) { switch ext { case "geoarrow.wkb": return "ST_AsText(" + sourceExpr + ")", nil @@ -77,10 +114,7 @@ func duckDBArrowGeometryWKTExprFromTrustedExtension(ext, sourceExpr string) (str } } -// duckDBArrowGeometryWKTExprFromPhysicalType is the best-effort path for -// unannotated Arrow columns. Without extension metadata we infer common -// geometry encodings from physical Arrow storage. -func duckDBArrowGeometryWKTExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { +func duckDBArrowGeometryWKTWireExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { switch arrowField.Type.ID() { case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: return "ST_AsText(ST_GeomFromWKB(" + sourceExpr + "))", nil @@ -110,10 +144,36 @@ func duckDBGeoArrowPointCoords(sql string) string { return "format('{} {}', struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" } +func duckDBGeoArrowPointGeometryExpr(sql string) string { + return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +} + func duckDBGeoArrowPointWKT(sql string) string { return "'POINT (' || " + duckDBGeoArrowPointCoords(sql) + " || ')'" } +func duckDBGeoArrowLineStringGeometryExpr(sql string) string { + return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointGeometryExpr("_p") + "))" +} + +func duckDBGeoArrowPolygonGeometryExpr(sql string) string { + shell := duckDBGeoArrowLineStringGeometryExpr(sql + "[1]") + holes := "list_transform(" + sql + "[2:], lambda _r: " + duckDBGeoArrowLineStringGeometryExpr("_r") + ")" + return "ST_MakePolygon(" + shell + ", " + holes + ")" +} + +func duckDBGeoArrowMultiPointGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointGeometryExpr("_p") + ")))" +} + +func duckDBGeoArrowMultiLineStringGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + duckDBGeoArrowLineStringGeometryExpr("_ls") + ")))" +} + +func duckDBGeoArrowMultiPolygonGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + duckDBGeoArrowPolygonGeometryExpr("_poly") + ")))" +} + func duckDBGeoArrowPointListCoords(sql string) string { return "array_to_string(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointCoords("_p") + "), ', ')" } @@ -166,3 +226,24 @@ func duckDBGeoArrowNativeWKT(ext, sql string) (string, error) { return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) } } + +func duckDBGeoArrowNativeGeometryExpr(ext, sql string) (string, error) { + switch ext { + case "geoarrow.point": + return duckDBGeoArrowPointGeometryExpr(sql), nil + case "geoarrow.linestring": + return duckDBGeoArrowLineStringGeometryExpr(sql), nil + case "geoarrow.polygon": + return duckDBGeoArrowPolygonGeometryExpr(sql), nil + case "geoarrow.multipoint": + return duckDBGeoArrowMultiPointGeometryExpr(sql), nil + case "geoarrow.multilinestring": + return duckDBGeoArrowMultiLineStringGeometryExpr(sql), nil + case "geoarrow.multipolygon": + return duckDBGeoArrowMultiPolygonGeometryExpr(sql), nil + case "geoarrow.geometry", "geoarrow.geometrycollection": + return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) + } +} diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index d9e36baa..886ec534 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -17,12 +17,12 @@ func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { ext string want string }{ - {"geoarrow.point", "POINT"}, - {"geoarrow.linestring", "LINESTRING"}, - {"geoarrow.polygon", "POLYGON"}, - {"geoarrow.multipoint", "MULTIPOINT"}, - {"geoarrow.multilinestring", "MULTILINESTRING"}, - {"geoarrow.multipolygon", "MULTIPOLYGON"}, + {"geoarrow.point", "ST_Point(struct_extract(geom, 'x'), struct_extract(geom, 'y'))"}, + {"geoarrow.linestring", "ST_MakeLine(list_transform(geom"}, + {"geoarrow.polygon", "ST_MakePolygon(ST_MakeLine(list_transform(geom[1]"}, + {"geoarrow.multipoint", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multilinestring", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multipolygon", "ST_Multi(ST_Collect(list_transform(geom"}, } for _, tt := range tests { @@ -38,14 +38,91 @@ func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { if got == "geom" { t.Fatalf("expected explicit conversion, got raw column") } - if !strings.Contains(got, "ST_GeomFromText(") || !strings.Contains(got, tt.want) { + if !strings.Contains(got, tt.want) || + strings.Contains(got, "ST_GeomFromText(") || + strings.Contains(got, "ST_AsText(") { t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) } }) } } -func TestPostgresArrowIngestBuildsNativeGeoArrowEWKTSelectExpr(t *testing.T) { +func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { + field := geometryTestField("") + + tests := []struct { + name string + typ arrow.DataType + ext string + want string + }{ + { + name: "trusted geoarrow wkb is already materialized as geometry", + typ: arrow.BinaryTypes.Binary, + ext: "geoarrow.wkb", + want: "geom", + }, + { + name: "trusted geoarrow wkt parses directly from text", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.wkt", + want: "ST_GeomFromText(geom, true)", + }, + { + name: "trusted geoarrow geojson parses directly from json", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "trusted hugr geojson parses directly from json", + typ: arrow.BinaryTypes.String, + ext: "hugr.geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "trusted plain geojson parses directly from json", + typ: arrow.BinaryTypes.String, + ext: "geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "unannotated binary parses directly as wkb", + typ: arrow.BinaryTypes.Binary, + want: "ST_GeomFromWKB(geom)", + }, + { + name: "unannotated string chooses geojson or wkt without text roundtrip", + typ: arrow.BinaryTypes.String, + want: "CASE WHEN starts_with(trim(geom), '{') THEN ST_GeomFromGeoJSON(geom) ELSE ST_GeomFromText(geom, true) END", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + meta := arrow.Metadata{} + if tt.ext != "" { + meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) + } + got, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + Name: "geom", + Type: tt.typ, + Metadata: meta, + }, "geom") + if err != nil { + t.Fatal(err) + } + if got != tt.want { + t.Fatalf("expected %s, got %s", tt.want, got) + } + if strings.Contains(got, "ST_AsText") { + t.Fatalf("expected direct geometry expression without ST_AsText, got %s", got) + } + }) + } +} + +func TestPostgresArrowIngestBuildsNativeGeoArrowWKTWireSelectExpr(t *testing.T) { field := geometryTestField("4326") tests := []struct { @@ -73,13 +150,85 @@ func TestPostgresArrowIngestBuildsNativeGeoArrowEWKTSelectExpr(t *testing.T) { if got == "geom" { t.Fatalf("expected explicit conversion, got raw column") } - if !strings.Contains(got, "'SRID=4326;' || ") || !strings.Contains(got, tt.want) { + if !strings.Contains(got, "'SRID=4326;' || ") || + !strings.Contains(got, tt.want) || + strings.Contains(got, "ST_AsHEXWKB(") { t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) } }) } } +func TestPostgresArrowIngestBuildsWKTWireSelectExpr(t *testing.T) { + field := geometryTestField("4326") + + tests := []struct { + name string + typ arrow.DataType + ext string + want string + }{ + { + name: "trusted geoarrow wkb is converted from materialized geometry to wkt", + typ: arrow.BinaryTypes.Binary, + ext: "geoarrow.wkb", + want: "'SRID=4326;' || ST_AsText(geom)", + }, + { + name: "trusted geoarrow wkt is passed as wkt", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.wkt", + want: "'SRID=4326;' || geom", + }, + { + name: "trusted geoarrow geojson parses to geometry then wkt", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.geojson", + want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + }, + { + name: "trusted hugr geojson parses to geometry then wkt", + typ: arrow.BinaryTypes.String, + ext: "hugr.geojson", + want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + }, + { + name: "trusted plain geojson parses to geometry then wkt", + typ: arrow.BinaryTypes.String, + ext: "geojson", + want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + }, + { + name: "unannotated binary parses as wkb then wkt", + typ: arrow.BinaryTypes.Binary, + want: "'SRID=4326;' || ST_AsText(ST_GeomFromWKB(geom))", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + meta := arrow.Metadata{} + if tt.ext != "" { + meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) + } + got, err := postgresArrowIngestSelectExpr(field, arrow.Field{ + Name: "geom", + Type: tt.typ, + Metadata: meta, + }, "geom") + if err != nil { + t.Fatal(err) + } + if got != tt.want { + t.Fatalf("expected %s, got %s", tt.want, got) + } + if strings.Contains(got, "ST_AsHEXWKB(") { + t.Fatalf("expected WKT wire expression without HEXWKB, got %s", got) + } + }) + } +} + func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { field := geometryTestField("") for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { @@ -145,8 +294,24 @@ func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { if err != nil { t.Fatal(err) } - if !strings.Contains(geomSQL, "'SRID=4326;' || ") || !strings.Contains(geomSQL, "POINT") { - t.Fatalf("expected Postgres EWKT literal, got %s", geomSQL) + if !strings.Contains(geomSQL, "'SRID=4326;'") || + !strings.Contains(geomSQL, "POINT") || + strings.Contains(geomSQL, "0101000000") { + t.Fatalf("expected Postgres WKT wire literal, got %s", geomSQL) + } +} + +func TestDuckDBArrowIngestLiteralExprUsesWKBStagingGeometry(t *testing.T) { + engine := &DuckDB{} + + geomSQL, err := engine.ArrowIngestLiteralExpr(geometryTestField(""), orb.Point{1, 2}) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(geomSQL, "ST_GeomFromWKB(from_hex('0101000000") || + strings.Contains(geomSQL, "ST_GeomFromText") || + strings.Contains(geomSQL, "POINT") { + t.Fatalf("expected DuckDB WKB geometry literal, got %s", geomSQL) } } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index f12f3397..a2802d85 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -2,6 +2,7 @@ package engines import ( "context" + "encoding/hex" "encoding/json" "fmt" "strconv" @@ -102,7 +103,11 @@ func (e *DuckDB) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, er if geom == nil { return "NULL", nil } - return e.SQLValue(geom) + wkbValue, err := ctypes.GeometryToSQLValue(geom) + if err != nil { + return "", err + } + return "ST_GeomFromWKB(from_hex('" + strings.ToUpper(hex.EncodeToString(wkbValue)) + "'))", nil } return e.SQLValue(value) } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 37820786..cc6b9fc7 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -75,8 +75,12 @@ type EngineTypeCaster interface { type EngineArrowIngestCaster interface { Engine - // ArrowIngestSelectExpr returns a DuckDB-compatible SELECT expression for - // one Arrow-view column, shaped for insertion into this target engine. + // ArrowIngestSelectExpr maps one Arrow-view column to a DuckDB staging SELECT + // expression shaped for this target engine. + // Example: for a Geometry field, arrowField extension "geoarrow.geojson", and + // sourceExpr `geom_geojson`, DuckDB returns `ST_GeomFromGeoJSON(geom_geojson)`, + // while Postgres returns an EWKT text expression such as + // `'SRID=4326;' || ST_AsText(...)`. ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) // ArrowIngestLiteralExpr returns a DuckDB-compatible literal/expression for // non-Arrow values mixed into the ingest SELECT, shaped for this target. diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index f5cfede7..7e01fe0f 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -638,7 +638,7 @@ func postgresArrowGeometryWKTExpr(field *ast.Field, arrowField arrow.Field, sour if field != nil && field.Definition != nil { srid = base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) } - wktExpr, err := duckDBArrowGeometryWKTExpr(arrowField, sourceExpr) + wktExpr, err := duckDBArrowGeometryWKTWireExpr(arrowField, sourceExpr) if err != nil { return "", err } diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 46df8992..2b87adfd 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -315,8 +315,8 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e field := ingestASTField(info, c.Field, c.FieldDef) // Build the Arrow ingest SELECT expression for the target // GraphQL/DB field. - // Examples: JSON to_json(...), Geometry ST_GeomFromText(...), - // or Postgres EWKT text for PostGIS columns. + // Examples: JSON to_json(...), DuckDB geometry expressions, + // or Postgres HEXWKB text for PostGIS columns. value, err := engine.ArrowIngestSelectExpr(field, c.ArrowField, value) if err != nil { return "", nil, err From 5bc2401494d86bfe9d852333437bead9fb09157e Mon Sep 17 00:00:00 2001 From: vadim Date: Thu, 18 Jun 2026 11:34:38 +0400 Subject: [PATCH 28/36] ipc ingest --- .../ingest-postgres/ingest_postgres_test.go | 78 ++++++--- .../ingest-postgres/testdata/init.sql | 24 +-- .../testdata/schemas/pg_ingest/schema.graphql | 24 +-- pkg/catalog/compiler/base/options.go | 2 + pkg/engines/arrow_ingest.go | 159 +++--------------- pkg/engines/arrow_ingest_test.go | 59 +++---- pkg/engines/duckdb.go | 4 +- pkg/engines/engines.go | 3 +- pkg/engines/postgres.go | 40 +---- 9 files changed, 145 insertions(+), 248 deletions(-) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 9f7aac8a..6e0c4c30 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -414,8 +414,8 @@ func TestIngest_Postgres_PermissionDataGeometry(t *testing.T) { "perm-geom-beta": "POINT(7.25 8.5)", }, got) assert.Equal(t, map[string]int{ - "perm-geom-alpha": 4326, - "perm-geom-beta": 4326, + "perm-geom-alpha": 0, + "perm-geom-beta": 0, }, gotSRID) } @@ -639,7 +639,9 @@ func geometryTypesSchema() *arrow.Schema { {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, + {Name: "geom_4326", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, + {Name: "geom_wkt_4326", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, {Name: "geom_hugr_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.geojson"})}, {Name: "geom_plain_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geojson"})}, @@ -655,7 +657,7 @@ func geometryTypesSchema() *arrow.Schema { func geometryTypesColumns() []string { return []string{ "name", "value", "is_active", - "geom", "geom_wkt", "geom_geojson", + "geom", "geom_4326", "geom_wkt", "geom_wkt_4326", "geom_geojson", "geom_hugr_geojson", "geom_plain_geojson", "geom_wkb", "geom_line", "geom_polygon_native", "geom_multipoint", "geom_multiline", "geom_multipolygon", @@ -663,14 +665,17 @@ func geometryTypesColumns() []string { } func geometryExpected(point, x, y string) []string { + line := fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) return []string{ point, - fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + point, + line, + line, polygonWKT(x, y), polygonWKT(x, y), polygonWKT(x, y), point, - fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), + line, polygonWKT(x, y), fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), @@ -678,6 +683,10 @@ func geometryExpected(point, x, y string) []string { } } +func geometrySRIDExpected() []int { + return []int{0, 4326, 0, 4326, 0, 0, 0, 0, 0, 0, 0, 0, 0} +} + func polygonWKT(x, y string) string { return fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s))", x, y, @@ -721,11 +730,13 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, events(%s, order_by: [{field: "name", direction: ASC}]) { name geom - geom_wkt - geom_geojson - geom_hugr_geojson - geom_plain_geojson - geom_wkb + geom_4326 + geom_wkt + geom_wkt_4326 + geom_geojson + geom_hugr_geojson + geom_plain_geojson + geom_wkb geom_line geom_polygon_native geom_multipoint @@ -765,7 +776,9 @@ func geometryReadExpected(name string, point [2]float64, x, y float64) map[strin return map[string]any{ "name": name, "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), + "geom_4326": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_wkt_4326": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), @@ -854,17 +867,23 @@ func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) - b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) - b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - b.Field(6).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + sb = b.Field(4).(*array.StructBuilder) + sb.Append(true) + sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) + sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + + b.Field(5).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) + b.Field(6).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) b.Field(7).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(8).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) + b.Field(9).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) - b.Field(8).(*array.BinaryBuilder).Append(wkbPoint) - appendPointList(b.Field(9).(*array.ListBuilder), linePoints(shapeX, shapeY)) - appendPointListList(b.Field(10).(*array.ListBuilder), polygonRings(shapeX, shapeY)) - appendPointList(b.Field(11).(*array.ListBuilder), multiPoints(shapeX, shapeY)) - appendPointListList(b.Field(12).(*array.ListBuilder), multiLines(shapeX, shapeY)) - appendPointListListList(b.Field(13).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) + b.Field(10).(*array.BinaryBuilder).Append(wkbPoint) + appendPointList(b.Field(11).(*array.ListBuilder), linePoints(shapeX, shapeY)) + appendPointListList(b.Field(12).(*array.ListBuilder), polygonRings(shapeX, shapeY)) + appendPointList(b.Field(13).(*array.ListBuilder), multiPoints(shapeX, shapeY)) + appendPointListList(b.Field(14).(*array.ListBuilder), multiLines(shapeX, shapeY)) + appendPointListListList(b.Field(15).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) } type xyPoint [2]float64 @@ -1544,7 +1563,9 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { rows, err := env.pgConn.Query(` SELECT name, ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_4326), ST_SRID(geom_4326), ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_wkt_4326), ST_SRID(geom_wkt_4326), ST_AsText(geom_geojson), ST_SRID(geom_geojson), ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), @@ -1565,8 +1586,8 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { gotSRID := map[string][]int{} for rows.Next() { var name string - values := make([]string, 11) - srids := make([]int, 11) + values := make([]string, 13) + srids := make([]int, 13) scanArgs := []any{&name} for i := range values { scanArgs = append(scanArgs, &values[i], &srids[i]) @@ -1584,8 +1605,8 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { "geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"), }, got) assert.Equal(t, map[string][]int{ - "geo-a": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, - "geo-b": []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, + "geo-a": geometrySRIDExpected(), + "geo-b": geometrySRIDExpected(), }, gotSRID) } @@ -1665,11 +1686,13 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - values := make([]string, 11) - srids := make([]int, 11) + values := make([]string, 13) + srids := make([]int, 13) require.NoError(t, env.pgConn.QueryRow(` SELECT ST_AsText(geom), ST_SRID(geom), + ST_AsText(geom_4326), ST_SRID(geom_4326), ST_AsText(geom_wkt), ST_SRID(geom_wkt), + ST_AsText(geom_wkt_4326), ST_SRID(geom_wkt_4326), ST_AsText(geom_geojson), ST_SRID(geom_geojson), ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), @@ -1685,13 +1708,14 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { &values[0], &srids[0], &values[1], &srids[1], &values[2], &srids[2], &values[3], &srids[3], &values[4], &srids[4], &values[5], &srids[5], &values[6], &srids[6], &values[7], &srids[7], &values[8], &srids[8], - &values[9], &srids[9], &values[10], &srids[10], + &values[9], &srids[9], &values[10], &srids[10], &values[11], &srids[11], + &values[12], &srids[12], )) for i := range values { values[i] = compactWKT(values[i]) } assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) - assert.Equal(t, []int{4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326, 4326}, srids) + assert.Equal(t, geometrySRIDExpected(), srids) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "pg-geo-bulk-049999" } }`, []map[string]any{ geometryReadExpected("pg-geo-bulk-049999", [2]float64{99, 49}, 99, 49), }) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 09add66e..0110f95b 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -13,15 +13,17 @@ CREATE TABLE events ( owner_id BIGINT, payload JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - geom GEOMETRY(Point, 4326), - geom_wkt GEOMETRY(LineString, 4326), - geom_geojson GEOMETRY(Polygon, 4326), - geom_hugr_geojson GEOMETRY(Polygon, 4326), - geom_plain_geojson GEOMETRY(Polygon, 4326), - geom_wkb GEOMETRY(Point, 4326), - geom_line GEOMETRY(LineString, 4326), - geom_polygon_native GEOMETRY(Polygon, 4326), - geom_multipoint GEOMETRY(MultiPoint, 4326), - geom_multiline GEOMETRY(MultiLineString, 4326), - geom_multipolygon GEOMETRY(MultiPolygon, 4326) + geom GEOMETRY(Point, 0), + geom_4326 GEOMETRY(Point, 4326), + geom_wkt GEOMETRY(LineString, 0), + geom_wkt_4326 GEOMETRY(LineString, 4326), + geom_geojson GEOMETRY(Polygon, 0), + geom_hugr_geojson GEOMETRY(Polygon, 0), + geom_plain_geojson GEOMETRY(Polygon, 0), + geom_wkb GEOMETRY(Point, 0), + geom_line GEOMETRY(LineString, 0), + geom_polygon_native GEOMETRY(Polygon, 0), + geom_multipoint GEOMETRY(MultiPoint, 0), + geom_multiline GEOMETRY(MultiLineString, 0), + geom_multipolygon GEOMETRY(MultiPolygon, 0) ); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 3c410e3f..dfe9807f 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -6,15 +6,17 @@ type events @table(name: "events") { owner_id: BigInt payload: JSON created_at: Timestamp @default(value: "now()") - geom: Geometry @geometry_info(srid: 4326, type: POINT) - geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) - geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) - geom_hugr_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) - geom_plain_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) - geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) - geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) - geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) - geom_multipoint: Geometry @geometry_info(srid: 4326, type: MULTIPOINT) - geom_multiline: Geometry @geometry_info(srid: 4326, type: MULTILINESTRING) - geom_multipolygon: Geometry @geometry_info(srid: 4326, type: MULTIPOLYGON) + geom: Geometry @geometry_info(srid: 0, type: POINT) + geom_4326: Geometry @geometry_info(srid: 4326, type: POINT) + geom_wkt: Geometry @geometry_info(srid: 0, type: LINESTRING) + geom_wkt_4326: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_hugr_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_plain_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_wkb: Geometry @geometry_info(srid: 0, type: POINT) + geom_line: Geometry @geometry_info(srid: 0, type: LINESTRING) + geom_polygon_native: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_multipoint: Geometry @geometry_info(srid: 0, type: MULTIPOINT) + geom_multiline: Geometry @geometry_info(srid: 0, type: MULTILINESTRING) + geom_multipolygon: Geometry @geometry_info(srid: 0, type: MULTIPOLYGON) } diff --git a/pkg/catalog/compiler/base/options.go b/pkg/catalog/compiler/base/options.go index b0598537..4f826d54 100644 --- a/pkg/catalog/compiler/base/options.go +++ b/pkg/catalog/compiler/base/options.go @@ -87,6 +87,8 @@ type EngineCapabilities struct { Insert EngineInsertCapabilities Update EngineUpdateCapabilities Delete EngineDeleteCapabilities + Ingest EngineDeleteCapabilities + // options: only insert / merge ... } type EngineInsertCapabilities struct { diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index ec9eef76..de68e53b 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -27,7 +27,7 @@ func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, return b.duckdb.FunctionCall(name, positional, named) } -func duckDBArrowJSONExpr(arrowField arrow.Field, sourceExpr string) string { +func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) string { switch arrowField.Type.ID() { case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW, arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: @@ -40,19 +40,19 @@ func duckDBArrowJSONExpr(arrowField arrow.Field, sourceExpr string) string { } } -func duckDBArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, error) { +func arrowIngestGeometryStagingExpr(arrowField arrow.Field, sourceExpr string) (string, error) { if ext := arrowExtensionName(arrowField); ext != "" { - return duckDBArrowGeometryExprFromTrustedExtension(ext, sourceExpr) + return arrowIngestGeometryStagingExprFromTrustedExtension(ext, sourceExpr) } - return duckDBArrowGeometryExprFromPhysicalType(arrowField, sourceExpr) + return arrowIngestGeometryStagingExprFromPhysicalType(arrowField, sourceExpr) } -// duckDBArrowGeometryExprFromTrustedExtension uses GeoArrow/Hugr extension +// arrowIngestGeometryStagingExprFromTrustedExtension uses GeoArrow/Hugr extension // metadata as the source of truth for geometry semantics. The physical Arrow // storage type is intentionally not used as a fallback once extension metadata // is present; unsupported metadata should fail during planning instead of being // guessed from Type.ID(). -func duckDBArrowGeometryExprFromTrustedExtension(ext, sourceExpr string) (string, error) { +func arrowIngestGeometryStagingExprFromTrustedExtension(ext, sourceExpr string) (string, error) { switch ext { case "geoarrow.wkb": return sourceExpr, nil @@ -63,16 +63,16 @@ func duckDBArrowGeometryExprFromTrustedExtension(ext, sourceExpr string) (string case "geoarrow.linestring", "geoarrow.polygon", "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - return duckDBGeoArrowNativeGeometryExpr(ext, sourceExpr) + return arrowIngestGeoArrowNativeGeometryStagingExpr(ext, sourceExpr) default: return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) } } -// duckDBArrowGeometryExprFromPhysicalType is the best-effort path for +// arrowIngestGeometryStagingExprFromPhysicalType is the best-effort path for // unannotated Arrow columns. Without extension metadata we infer common // geometry encodings from physical Arrow storage. -func duckDBArrowGeometryExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { +func arrowIngestGeometryStagingExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { switch arrowField.Type.ID() { case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: return "ST_GeomFromWKB(" + sourceExpr + ")", nil @@ -85,48 +85,6 @@ func duckDBArrowGeometryExprFromPhysicalType(arrowField arrow.Field, sourceExpr } } -func duckDBArrowGeometryWKTWireExpr(arrowField arrow.Field, sourceExpr string) (string, error) { - if ext := arrowExtensionName(arrowField); ext != "" { - return duckDBArrowGeometryWKTWireExprFromTrustedExtension(ext, sourceExpr) - } - return duckDBArrowGeometryWKTWireExprFromPhysicalType(arrowField, sourceExpr) -} - -// duckDBArrowGeometryWKTWireExprFromTrustedExtension builds a DuckDB expression -// returning WKT text for engines whose insert path cannot accept DuckDB -// GEOMETRY/WKB values directly. This is currently needed for Postgres attached -// tables: DuckDB's postgres extension COPY path accepts WKT/EWKT text for -// PostGIS geometry columns, but rejects WKB_BLOB/HEXWKB expressions. -func duckDBArrowGeometryWKTWireExprFromTrustedExtension(ext, sourceExpr string) (string, error) { - switch ext { - case "geoarrow.wkb": - return "ST_AsText(" + sourceExpr + ")", nil - case "geoarrow.wkt": - return sourceExpr, nil - case "hugr.geojson", "geoarrow.geojson", "geojson": - return "ST_AsText(ST_GeomFromGeoJSON(" + sourceExpr + "))", nil - case "geoarrow.linestring", "geoarrow.polygon", - "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - return duckDBGeoArrowNativeWKT(ext, sourceExpr) - default: - return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) - } -} - -func duckDBArrowGeometryWKTWireExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { - switch arrowField.Type.ID() { - case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: - return "ST_AsText(ST_GeomFromWKB(" + sourceExpr + "))", nil - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CASE WHEN starts_with(trim(" + sourceExpr + "), '{') THEN ST_AsText(ST_GeomFromGeoJSON(" + sourceExpr + ")) ELSE " + sourceExpr + " END", nil - case arrow.STRUCT, arrow.MAP: - return "ST_AsText(ST_GeomFromGeoJSON(to_json(" + sourceExpr + ")::VARCHAR))", nil - default: - return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) - } -} - func arrowExtensionName(field arrow.Field) string { if extType, ok := field.Type.(arrow.ExtensionType); ok { return strings.ToLower(extType.ExtensionName()) @@ -140,107 +98,46 @@ func arrowExtensionName(field arrow.Field) string { return "" } -func duckDBGeoArrowPointCoords(sql string) string { - return "format('{} {}', struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" -} - -func duckDBGeoArrowPointGeometryExpr(sql string) string { +func arrowIngestGeoArrowPointGeometryStagingExpr(sql string) string { return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" } -func duckDBGeoArrowPointWKT(sql string) string { - return "'POINT (' || " + duckDBGeoArrowPointCoords(sql) + " || ')'" -} - -func duckDBGeoArrowLineStringGeometryExpr(sql string) string { - return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointGeometryExpr("_p") + "))" +func arrowIngestGeoArrowLineStringGeometryStagingExpr(sql string) string { + return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + arrowIngestGeoArrowPointGeometryStagingExpr("_p") + "))" } -func duckDBGeoArrowPolygonGeometryExpr(sql string) string { - shell := duckDBGeoArrowLineStringGeometryExpr(sql + "[1]") - holes := "list_transform(" + sql + "[2:], lambda _r: " + duckDBGeoArrowLineStringGeometryExpr("_r") + ")" +func arrowIngestGeoArrowPolygonGeometryStagingExpr(sql string) string { + shell := arrowIngestGeoArrowLineStringGeometryStagingExpr(sql + "[1]") + holes := "list_transform(" + sql + "[2:], lambda _r: " + arrowIngestGeoArrowLineStringGeometryStagingExpr("_r") + ")" return "ST_MakePolygon(" + shell + ", " + holes + ")" } -func duckDBGeoArrowMultiPointGeometryExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointGeometryExpr("_p") + ")))" -} - -func duckDBGeoArrowMultiLineStringGeometryExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + duckDBGeoArrowLineStringGeometryExpr("_ls") + ")))" -} - -func duckDBGeoArrowMultiPolygonGeometryExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + duckDBGeoArrowPolygonGeometryExpr("_poly") + ")))" -} - -func duckDBGeoArrowPointListCoords(sql string) string { - return "array_to_string(list_transform(" + sql + ", lambda _p: " + duckDBGeoArrowPointCoords("_p") + "), ', ')" -} - -func duckDBGeoArrowLineStringWKT(sql string) string { - return "'LINESTRING (' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" -} - -func duckDBGeoArrowRingWKT(sql string) string { - return "'(' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" -} - -func duckDBGeoArrowPolygonWKT(sql string) string { - return "'POLYGON (' || array_to_string(list_transform(" + sql + ", lambda _r: " + - duckDBGeoArrowRingWKT("_r") + "), ', ') || ')'" -} - -func duckDBGeoArrowMultiPointWKT(sql string) string { - return "'MULTIPOINT (' || " + duckDBGeoArrowPointListCoords(sql) + " || ')'" +func arrowIngestGeoArrowMultiPointGeometryStagingExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + arrowIngestGeoArrowPointGeometryStagingExpr("_p") + ")))" } -func duckDBGeoArrowMultiLineStringWKT(sql string) string { - return "'MULTILINESTRING (' || array_to_string(list_transform(" + sql + ", lambda _ls: " + - duckDBGeoArrowRingWKT("_ls") + "), ', ') || ')'" +func arrowIngestGeoArrowMultiLineStringGeometryStagingExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + arrowIngestGeoArrowLineStringGeometryStagingExpr("_ls") + ")))" } -func duckDBGeoArrowMultiPolygonWKT(sql string) string { - return "'MULTIPOLYGON (' || array_to_string(list_transform(" + sql + ", lambda _poly: '(' || " + - "array_to_string(list_transform(_poly, lambda _r: " + duckDBGeoArrowRingWKT("_r") + - "), ', ') || ')'), ', ') || ')'" -} - -func duckDBGeoArrowNativeWKT(ext, sql string) (string, error) { - switch ext { - case "geoarrow.point": - return duckDBGeoArrowPointWKT(sql), nil - case "geoarrow.linestring": - return duckDBGeoArrowLineStringWKT(sql), nil - case "geoarrow.polygon": - return duckDBGeoArrowPolygonWKT(sql), nil - case "geoarrow.multipoint": - return duckDBGeoArrowMultiPointWKT(sql), nil - case "geoarrow.multilinestring": - return duckDBGeoArrowMultiLineStringWKT(sql), nil - case "geoarrow.multipolygon": - return duckDBGeoArrowMultiPolygonWKT(sql), nil - case "geoarrow.geometry", "geoarrow.geometrycollection": - return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) - default: - return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) - } +func arrowIngestGeoArrowMultiPolygonGeometryStagingExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + arrowIngestGeoArrowPolygonGeometryStagingExpr("_poly") + ")))" } -func duckDBGeoArrowNativeGeometryExpr(ext, sql string) (string, error) { +func arrowIngestGeoArrowNativeGeometryStagingExpr(ext, sql string) (string, error) { switch ext { case "geoarrow.point": - return duckDBGeoArrowPointGeometryExpr(sql), nil + return arrowIngestGeoArrowPointGeometryStagingExpr(sql), nil case "geoarrow.linestring": - return duckDBGeoArrowLineStringGeometryExpr(sql), nil + return arrowIngestGeoArrowLineStringGeometryStagingExpr(sql), nil case "geoarrow.polygon": - return duckDBGeoArrowPolygonGeometryExpr(sql), nil + return arrowIngestGeoArrowPolygonGeometryStagingExpr(sql), nil case "geoarrow.multipoint": - return duckDBGeoArrowMultiPointGeometryExpr(sql), nil + return arrowIngestGeoArrowMultiPointGeometryStagingExpr(sql), nil case "geoarrow.multilinestring": - return duckDBGeoArrowMultiLineStringGeometryExpr(sql), nil + return arrowIngestGeoArrowMultiLineStringGeometryStagingExpr(sql), nil case "geoarrow.multipolygon": - return duckDBGeoArrowMultiPolygonGeometryExpr(sql), nil + return arrowIngestGeoArrowMultiPolygonGeometryStagingExpr(sql), nil case "geoarrow.geometry", "geoarrow.geometrycollection": return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) default: diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index 886ec534..0abd2e64 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -122,19 +122,19 @@ func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { } } -func TestPostgresArrowIngestBuildsNativeGeoArrowWKTWireSelectExpr(t *testing.T) { +func TestPostgresArrowIngestBuildsNativeGeoArrowDirectSelectExpr(t *testing.T) { field := geometryTestField("4326") tests := []struct { ext string want string }{ - {"geoarrow.point", "POINT"}, - {"geoarrow.linestring", "LINESTRING"}, - {"geoarrow.polygon", "POLYGON"}, - {"geoarrow.multipoint", "MULTIPOINT"}, - {"geoarrow.multilinestring", "MULTILINESTRING"}, - {"geoarrow.multipolygon", "MULTIPOLYGON"}, + {"geoarrow.point", "ST_Point(struct_extract(geom, 'x'), struct_extract(geom, 'y'))"}, + {"geoarrow.linestring", "ST_MakeLine(list_transform(geom"}, + {"geoarrow.polygon", "ST_MakePolygon(ST_MakeLine(list_transform(geom[1]"}, + {"geoarrow.multipoint", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multilinestring", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multipolygon", "ST_Multi(ST_Collect(list_transform(geom"}, } for _, tt := range tests { @@ -150,16 +150,16 @@ func TestPostgresArrowIngestBuildsNativeGeoArrowWKTWireSelectExpr(t *testing.T) if got == "geom" { t.Fatalf("expected explicit conversion, got raw column") } - if !strings.Contains(got, "'SRID=4326;' || ") || - !strings.Contains(got, tt.want) || - strings.Contains(got, "ST_AsHEXWKB(") { + if !strings.Contains(got, tt.want) || + strings.Contains(got, "'SRID=4326;'") || + strings.Contains(got, "ST_AsText(") { t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) } }) } } -func TestPostgresArrowIngestBuildsWKTWireSelectExpr(t *testing.T) { +func TestPostgresArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { field := geometryTestField("4326") tests := []struct { @@ -169,39 +169,39 @@ func TestPostgresArrowIngestBuildsWKTWireSelectExpr(t *testing.T) { want string }{ { - name: "trusted geoarrow wkb is converted from materialized geometry to wkt", + name: "trusted geoarrow wkb is already materialized as geometry", typ: arrow.BinaryTypes.Binary, ext: "geoarrow.wkb", - want: "'SRID=4326;' || ST_AsText(geom)", + want: "geom", }, { - name: "trusted geoarrow wkt is passed as wkt", + name: "trusted geoarrow wkt parses directly from text", typ: arrow.BinaryTypes.String, ext: "geoarrow.wkt", - want: "'SRID=4326;' || geom", + want: "ST_GeomFromText(geom, true)", }, { - name: "trusted geoarrow geojson parses to geometry then wkt", + name: "trusted geoarrow geojson parses directly from json", typ: arrow.BinaryTypes.String, ext: "geoarrow.geojson", - want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + want: "ST_GeomFromGeoJSON(geom)", }, { - name: "trusted hugr geojson parses to geometry then wkt", + name: "trusted hugr geojson parses directly from json", typ: arrow.BinaryTypes.String, ext: "hugr.geojson", - want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + want: "ST_GeomFromGeoJSON(geom)", }, { - name: "trusted plain geojson parses to geometry then wkt", + name: "trusted plain geojson parses directly from json", typ: arrow.BinaryTypes.String, ext: "geojson", - want: "'SRID=4326;' || ST_AsText(ST_GeomFromGeoJSON(geom))", + want: "ST_GeomFromGeoJSON(geom)", }, { - name: "unannotated binary parses as wkb then wkt", + name: "unannotated binary parses directly as wkb", typ: arrow.BinaryTypes.Binary, - want: "'SRID=4326;' || ST_AsText(ST_GeomFromWKB(geom))", + want: "ST_GeomFromWKB(geom)", }, } @@ -222,8 +222,8 @@ func TestPostgresArrowIngestBuildsWKTWireSelectExpr(t *testing.T) { if got != tt.want { t.Fatalf("expected %s, got %s", tt.want, got) } - if strings.Contains(got, "ST_AsHEXWKB(") { - t.Fatalf("expected WKT wire expression without HEXWKB, got %s", got) + if strings.Contains(got, "'SRID=4326;'") || strings.Contains(got, "ST_AsText(") { + t.Fatalf("expected direct geometry expression, got %s", got) } }) } @@ -294,10 +294,11 @@ func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { if err != nil { t.Fatal(err) } - if !strings.Contains(geomSQL, "'SRID=4326;'") || - !strings.Contains(geomSQL, "POINT") || - strings.Contains(geomSQL, "0101000000") { - t.Fatalf("expected Postgres WKT wire literal, got %s", geomSQL) + if !strings.Contains(geomSQL, "ST_GeomFromWKB(from_hex('0101000000") || + strings.Contains(geomSQL, "'SRID=4326;'") || + strings.Contains(geomSQL, "ST_GeomFromText") || + strings.Contains(geomSQL, "POINT") { + t.Fatalf("expected Postgres WKB geometry literal, got %s", geomSQL) } } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index a2802d85..6fb9d3c2 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -118,9 +118,9 @@ func duckDBArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourc } switch field.Definition.Type.Name() { case base.JSONTypeName: - return duckDBArrowJSONExpr(arrowField, sourceExpr), nil + return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil case base.GeometryTypeName: - return duckDBArrowGeometryExpr(arrowField, sourceExpr) + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) default: return sourceExpr, nil } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index cc6b9fc7..4bfa6fc1 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -79,8 +79,7 @@ type EngineArrowIngestCaster interface { // expression shaped for this target engine. // Example: for a Geometry field, arrowField extension "geoarrow.geojson", and // sourceExpr `geom_geojson`, DuckDB returns `ST_GeomFromGeoJSON(geom_geojson)`, - // while Postgres returns an EWKT text expression such as - // `'SRID=4326;' || ST_AsText(...)`. + // and Postgres returns the same DuckDB staging geometry expression. ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) // ArrowIngestLiteralExpr returns a DuckDB-compatible literal/expression for // non-Arrow values mixed into the ingest SELECT, shaped for this target. diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 7e01fe0f..8d01146d 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -600,23 +600,8 @@ func (e *Postgres) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Fiel } func (e *Postgres) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { - if value == nil { - return "NULL", nil - } - if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { - geom, err := ctypes.ParseGeometryValue(value) - if err != nil { - return "", err - } - if geom == nil { - return "NULL", nil - } - srid := base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) - wktValue := strings.ReplaceAll(string(wkt.Marshal(geom)), "'", "''") - return postgresWKTText("'"+wktValue+"'", srid), nil - } var duckdb DuckDB - return duckdb.SQLValue(value) + return duckdb.ArrowIngestLiteralExpr(field, value) } func postgresArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { @@ -625,31 +610,16 @@ func postgresArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sou } switch field.Definition.Type.Name() { case base.JSONTypeName: - return duckDBArrowJSONExpr(arrowField, sourceExpr), nil + return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil case base.GeometryTypeName: - return postgresArrowGeometryWKTExpr(field, arrowField, sourceExpr) + return postgresArrowGeometryExpr(arrowField, sourceExpr) default: return sourceExpr, nil } } -func postgresArrowGeometryWKTExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - srid := "" - if field != nil && field.Definition != nil { - srid = base.FieldDefDirectiveArgString(field.Definition, base.FieldGeometryInfoDirectiveName, base.ArgSRID) - } - wktExpr, err := duckDBArrowGeometryWKTWireExpr(arrowField, sourceExpr) - if err != nil { - return "", err - } - return postgresWKTText(wktExpr, srid), nil -} - -func postgresWKTText(sql, srid string) string { - if srid == "" || srid == "0" { - return sql - } - return "'SRID=" + srid + ";' || " + sql +func postgresArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) } func pgRangeValueToSQLValue(v any) (string, error) { From ab690ad0d48eeda1e330fee9e2b93ce268f04ce3 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 20 Jun 2026 11:15:19 +0400 Subject: [PATCH 29/36] ipc ingest --- pkg/catalog/compiler/base/options.go | 22 +++++++++++-- pkg/catalog/compiler/base/options_test.go | 28 ++++++++++++++++ pkg/catalog/compiler/options.go | 3 ++ pkg/engines/airport.go | 2 +- pkg/engines/duckdb.go | 4 ++- pkg/engines/ducklake.go | 2 +- pkg/engines/engines.go | 17 +++++++--- pkg/engines/iceberg.go | 2 +- pkg/engines/postgres.go | 4 ++- pkg/planner/node_arrow_ingest.go | 40 +++++++++++++++++++---- pkg/planner/node_arrow_ingest_test.go | 39 ++++++++++++++++++++++ 11 files changed, 144 insertions(+), 19 deletions(-) create mode 100644 pkg/catalog/compiler/base/options_test.go create mode 100644 pkg/planner/node_arrow_ingest_test.go diff --git a/pkg/catalog/compiler/base/options.go b/pkg/catalog/compiler/base/options.go index 4f826d54..de6cb031 100644 --- a/pkg/catalog/compiler/base/options.go +++ b/pkg/catalog/compiler/base/options.go @@ -87,17 +87,33 @@ type EngineCapabilities struct { Insert EngineInsertCapabilities Update EngineUpdateCapabilities Delete EngineDeleteCapabilities - Ingest EngineDeleteCapabilities - // options: only insert / merge ... + Ingest EngineIngestCapabilities } type EngineInsertCapabilities struct { Insert bool - Ingest bool Returning bool InsertReferences bool } +type EngineIngestCapabilities struct { + // Insert enables append-only INSERT ... SELECT ingest. + Insert bool + // Merge enables MERGE INTO ingest and requires Insert support. + Merge bool +} + +// Available reports whether the engine supports at least one ingest mode. +func (c EngineIngestCapabilities) Available() bool { + return c.Insert || c.Merge +} + +// Valid reports whether the ingest modes form a supported combination. +// Merge ingest builds on insert semantics and cannot be enabled on its own. +func (c EngineIngestCapabilities) Valid() bool { + return c.Insert || !c.Merge +} + type EngineUpdateCapabilities struct { Update bool UpdatePKColumns bool diff --git a/pkg/catalog/compiler/base/options_test.go b/pkg/catalog/compiler/base/options_test.go new file mode 100644 index 00000000..579bc5e6 --- /dev/null +++ b/pkg/catalog/compiler/base/options_test.go @@ -0,0 +1,28 @@ +package base + +import "testing" + +func TestEngineIngestCapabilities(t *testing.T) { + tests := []struct { + name string + caps EngineIngestCapabilities + available bool + valid bool + }{ + {name: "disabled", caps: EngineIngestCapabilities{}, available: false, valid: true}, + {name: "insert", caps: EngineIngestCapabilities{Insert: true}, available: true, valid: true}, + {name: "insert and merge", caps: EngineIngestCapabilities{Insert: true, Merge: true}, available: true, valid: true}, + {name: "merge without insert", caps: EngineIngestCapabilities{Merge: true}, available: true, valid: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.caps.Available(); got != tt.available { + t.Fatalf("Available() = %t, want %t", got, tt.available) + } + if got := tt.caps.Valid(); got != tt.valid { + t.Fatalf("Valid() = %t, want %t", got, tt.valid) + } + }) + } +} diff --git a/pkg/catalog/compiler/options.go b/pkg/catalog/compiler/options.go index e5e9f2ca..fd1235b6 100644 --- a/pkg/catalog/compiler/options.go +++ b/pkg/catalog/compiler/options.go @@ -11,6 +11,9 @@ type EngineCapabilities = base.EngineCapabilities // EngineInsertCapabilities is an alias for base.EngineInsertCapabilities. type EngineInsertCapabilities = base.EngineInsertCapabilities +// EngineIngestCapabilities is an alias for base.EngineIngestCapabilities. +type EngineIngestCapabilities = base.EngineIngestCapabilities + // EngineUpdateCapabilities is an alias for base.EngineUpdateCapabilities. type EngineUpdateCapabilities = base.EngineUpdateCapabilities diff --git a/pkg/engines/airport.go b/pkg/engines/airport.go index a8ea22ac..2f2617b0 100644 --- a/pkg/engines/airport.go +++ b/pkg/engines/airport.go @@ -26,7 +26,7 @@ func (e *AirportEngine) Type() Type { func (e *AirportEngine) Capabilities() *compiler.EngineCapabilities { cap := e.DuckDB.Capabilities() cap.General.SupportDefaultSequences = false - cap.Insert.Ingest = false + cap.Ingest = compiler.EngineIngestCapabilities{} return cap } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 6fb9d3c2..b6c7611f 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -71,10 +71,12 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { }, Insert: compiler.EngineInsertCapabilities{ Insert: true, - Ingest: true, Returning: true, InsertReferences: true, }, + Ingest: compiler.EngineIngestCapabilities{ + Insert: true, + }, Update: compiler.EngineUpdateCapabilities{ Update: true, UpdatePKColumns: true, diff --git a/pkg/engines/ducklake.go b/pkg/engines/ducklake.go index f8ca61e5..2c340073 100644 --- a/pkg/engines/ducklake.go +++ b/pkg/engines/ducklake.go @@ -31,7 +31,7 @@ func (e *DuckLake) Capabilities() *compiler.EngineCapabilities { dbCaps := e.duckdb.Capabilities() caps := *dbCaps // defensive copy caps.General.SupportTimeTravel = true - caps.Insert.Ingest = false + caps.Ingest = compiler.EngineIngestCapabilities{} return &caps } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 4bfa6fc1..48f69225 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -76,16 +76,25 @@ type EngineTypeCaster interface { type EngineArrowIngestCaster interface { Engine // ArrowIngestSelectExpr maps one Arrow-view column to a DuckDB staging SELECT - // expression shaped for this target engine. + // expression using canonical DuckDB value types. // Example: for a Geometry field, arrowField extension "geoarrow.geojson", and - // sourceExpr `geom_geojson`, DuckDB returns `ST_GeomFromGeoJSON(geom_geojson)`, - // and Postgres returns the same DuckDB staging geometry expression. + // sourceExpr `geom_geojson`, this returns + // `ST_GeomFromGeoJSON(geom_geojson)` as a DuckDB GEOMETRY expression. ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) // ArrowIngestLiteralExpr returns a DuckDB-compatible literal/expression for - // non-Arrow values mixed into the ingest SELECT, shaped for this target. + // non-Arrow values mixed into the ingest SELECT. ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) } +// EngineIngestTargetCaster is implemented by engines whose ingest target +// cannot consume canonical DuckDB staging values directly. +type EngineIngestTargetCaster interface { + Engine + // CastIngestValueToTarget converts a DuckDB staging SELECT expression into + // the representation accepted by the target source during ingest. + CastIngestValueToTarget(field *ast.Field, stagingExpr string) (string, error) +} + type EngineVectorDistanceCalculator interface { VectorDistanceSQL(sql, distMetric string, vector types.Vector, params []any) (string, []any, error) } diff --git a/pkg/engines/iceberg.go b/pkg/engines/iceberg.go index da67b53b..4bac4666 100644 --- a/pkg/engines/iceberg.go +++ b/pkg/engines/iceberg.go @@ -33,7 +33,7 @@ func (e *Iceberg) Capabilities() *compiler.EngineCapabilities { caps.General.SupportTimeTravel = true // DuckDB Iceberg extension doesn't support targeted inserts (INSERT INTO tbl(col1,col2) VALUES ...) caps.Insert.Insert = false - caps.Insert.Ingest = false + caps.Ingest = compiler.EngineIngestCapabilities{} caps.Insert.Returning = false caps.Insert.InsertReferences = false return &caps diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 8d01146d..95661499 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -46,10 +46,12 @@ func (e *Postgres) Capabilities() *compiler.EngineCapabilities { }, Insert: compiler.EngineInsertCapabilities{ Insert: true, - Ingest: true, Returning: true, InsertReferences: true, }, + Ingest: compiler.EngineIngestCapabilities{ + Insert: true, + }, Update: compiler.EngineUpdateCapabilities{ Update: true, UpdatePKColumns: true, diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 2b87adfd..8538eea1 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -39,7 +39,14 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err != nil { return nil, err } - if caps := engine.Capabilities(); caps == nil || !caps.Insert.Ingest { + caps := engine.Capabilities() + if caps == nil { + return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + } + if !caps.Ingest.Valid() { + return nil, fmt.Errorf("engine %q has invalid ingest capabilities: merge requires insert", engine.Type()) + } + if !caps.Ingest.Available() { return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } ingestEngine, ok := engine.(engines.EngineArrowIngestCaster) @@ -315,8 +322,8 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e field := ingestASTField(info, c.Field, c.FieldDef) // Build the Arrow ingest SELECT expression for the target // GraphQL/DB field. - // Examples: JSON to_json(...), DuckDB geometry expressions, - // or Postgres HEXWKB text for PostGIS columns. + // Examples: JSON to_json(...) or DuckDB staging geometry + // expressions used for both DuckDB and attached Postgres targets. value, err := engine.ArrowIngestSelectExpr(field, c.ArrowField, value) if err != nil { return "", nil, err @@ -347,12 +354,11 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } // Arrow ingest SELECT expressions are evaluated by DuckDB because // the temporary Arrow view is registered on a DuckDB connection. - // Target engines shape column values above; default/auth helper - // expressions must still be valid in the DuckDB staging SELECT. + // Default/auth helper expressions must therefore use the same canonical + // DuckDB staging types before optional target casting is applied below. if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engines.NewArrowIngestStagingBuilder()); err != nil { return "", nil, err } - var targetFields, selectExprs []string for _, c := range columns { // targetFields are DB table columns. FieldSourceName applies the @@ -360,7 +366,11 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e targetFields = append(targetFields, c.Field.FieldSourceName("", true)) // selectExprs are evaluated from the DuckDB Arrow view and must // stay in the same order as targetFields. - selectExprs = append(selectExprs, fieldValues[c.Field.Name]) + expr, err := castIngestValueToTarget(engine, ingestASTField(info, c.Field, c.FieldDef), fieldValues[c.Field.Name]) + if err != nil { + return "", nil, err + } + selectExprs = append(selectExprs, expr) delete(fieldValues, c.Field.Name) } for _, fieldInfo := range mutation.Fields() { @@ -373,6 +383,14 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e if fieldInfo.FieldSourceName("", false) == "-" { continue } + fieldDef := info.Definition().Fields.ForName(fieldInfo.Name) + if fieldDef == nil { + return "", nil, fmt.Errorf("ingest field %q definition not found in data object %q", fieldInfo.Name, info.Name) + } + expr, err := castIngestValueToTarget(engine, ingestASTField(info, fieldInfo, fieldDef), expr) + if err != nil { + return "", nil, err + } targetFields = append(targetFields, fieldInfo.FieldSourceName("", true)) selectExprs = append(selectExprs, expr) delete(fieldValues, fieldInfo.Name) @@ -394,6 +412,14 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } } +func castIngestValueToTarget(engine engines.EngineArrowIngestCaster, field *ast.Field, stagingExpr string) (string, error) { + targetCaster, ok := engine.(engines.EngineIngestTargetCaster) + if !ok { + return stagingExpr, nil + } + return targetCaster.CastIngestValueToTarget(field, stagingExpr) +} + func ingestASTField(info *sdl.Object, fieldInfo *sdl.Field, fieldDef *ast.FieldDefinition) *ast.Field { return &ast.Field{ Name: fieldInfo.Name, diff --git a/pkg/planner/node_arrow_ingest_test.go b/pkg/planner/node_arrow_ingest_test.go new file mode 100644 index 00000000..87d8cdcf --- /dev/null +++ b/pkg/planner/node_arrow_ingest_test.go @@ -0,0 +1,39 @@ +package planner + +import ( + "testing" + + "github.com/hugr-lab/query-engine/pkg/engines" + "github.com/vektah/gqlparser/v2/ast" +) + +type testIngestTargetCaster struct { + *engines.DuckDB +} + +func (e *testIngestTargetCaster) CastIngestValueToTarget(_ *ast.Field, stagingExpr string) (string, error) { + return "target_cast(" + stagingExpr + ")", nil +} + +func TestCastIngestValueToTarget(t *testing.T) { + t.Run("direct target", func(t *testing.T) { + got, err := castIngestValueToTarget(engines.NewDuckDB(), nil, "staging_value") + if err != nil { + t.Fatal(err) + } + if got != "staging_value" { + t.Fatalf("got %q, want unchanged staging expression", got) + } + }) + + t.Run("target caster", func(t *testing.T) { + engine := &testIngestTargetCaster{DuckDB: engines.NewDuckDB()} + got, err := castIngestValueToTarget(engine, nil, "staging_value") + if err != nil { + t.Fatal(err) + } + if got != "target_cast(staging_value)" { + t.Fatalf("got %q, want target cast expression", got) + } + }) +} From 630e39aef83131d152ee9d2502365ef293f1ed2a Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 20 Jun 2026 11:30:40 +0400 Subject: [PATCH 30/36] ipc ingest --- ipc-ingest.go | 2 +- pkg/arrow-ingest/source.go | 19 ++++++----- pkg/arrow-ingest/source_test.go | 20 +++++++++++ pkg/db/pool.go | 28 ++++++++++++--- pkg/db/pool_test.go | 58 ++++++++++++++++++++++++++++++++ pkg/planner/node_arrow_ingest.go | 16 +++++---- 6 files changed, 122 insertions(+), 21 deletions(-) create mode 100644 pkg/arrow-ingest/source_test.go diff --git a/ipc-ingest.go b/ipc-ingest.go index 6400e9d4..15452056 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -34,7 +34,7 @@ type ingestErrorBody struct { // ipcIngestHandler accepts an Apache Arrow IPC stream in the request body and // inserts it into a table data object. The planner resolves the target schema, // validates insert inputs/permissions, casts Arrow values, and builds the -// INSERT FROM SELECT statement over a temporary Arrow view. +// INSERT FROM SELECT statement over a request-scoped Arrow view. func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") diff --git a/pkg/arrow-ingest/source.go b/pkg/arrow-ingest/source.go index 07a9eacf..2ad1b3dc 100644 --- a/pkg/arrow-ingest/source.go +++ b/pkg/arrow-ingest/source.go @@ -6,30 +6,28 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" + "github.com/google/uuid" ) -const DefaultViewName = "_hugr_arrow_view" +const viewNamePrefix = "_hugr_arrow_view_" // Source is the shared contract between the IPC ingest handler, planner, and -// DB executor. The planner builds SQL against ViewName; the DB executor -// registers Reader under the same per-connection DuckDB view name. +// DB executor. The planner builds SQL against the source view name; the DB +// executor registers Reader under that same globally unique DuckDB view name. type Source struct { Reader array.RecordReader - ViewName string + viewName string } func NewSource(reader array.RecordReader) Source { return Source{ Reader: reader, - ViewName: DefaultViewName, + viewName: viewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""), } } func (s Source) View() string { - if s.ViewName == "" { - return DefaultViewName - } - return s.ViewName + return s.viewName } // NeedsSpatial reports whether the Arrow source carries geometry extension @@ -59,6 +57,9 @@ func (s Source) RegisterView(arrowConn interface { if s.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } + if s.View() == "" { + return nil, fmt.Errorf("missing arrow view name") + } return arrowConn.RegisterView(s.Reader, s.View()) } diff --git a/pkg/arrow-ingest/source_test.go b/pkg/arrow-ingest/source_test.go new file mode 100644 index 00000000..69c197f0 --- /dev/null +++ b/pkg/arrow-ingest/source_test.go @@ -0,0 +1,20 @@ +package arrowingest + +import ( + "strings" + "testing" +) + +func TestNewSourceUsesUniqueViewName(t *testing.T) { + first := NewSource(nil) + second := NewSource(nil) + + if first.View() == second.View() { + t.Fatalf("sources share view name %q", first.View()) + } + for _, name := range []string{first.View(), second.View()} { + if !strings.HasPrefix(name, viewNamePrefix) { + t.Fatalf("view name %q does not start with %q", name, viewNamePrefix) + } + } +} diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 096eb6a6..0289de36 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -4,9 +4,11 @@ import ( "context" "database/sql" "database/sql/driver" + "errors" "fmt" "strings" "sync" + "time" "github.com/duckdb/duckdb-go/v2" arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" @@ -222,12 +224,15 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { }, nil } -// ExecArrowIngest registers source.Reader as source.ViewName and executes query -// on the same DuckDB driver connection, where the temporary Arrow view is visible. -func (p *Pool) ExecArrowIngest(ctx context.Context, source arrowingest.Source, query string) (sql.Result, error) { +// ExecArrowIngest registers source.Reader as a globally named DuckDB view, +// executes query, then drops the view before releasing the Arrow stream. +func (p *Pool) ExecArrowIngest(ctx context.Context, source arrowingest.Source, query string) (result sql.Result, err error) { if source.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } + if source.View() == "" { + return nil, fmt.Errorf("missing arrow view name") + } ar, err := p.Arrow(ctx) if err != nil { return nil, err @@ -247,11 +252,26 @@ func (p *Pool) ExecArrowIngest(ctx context.Context, source arrowingest.Source, q if err != nil { return nil, fmt.Errorf("register arrow view: %w", err) } - defer release() + defer func() { + // The view created by duckdb_arrow_scan is global to the DuckDB + // database instance. Cleanup must outlive a canceled request context + // and must happen before the Arrow stream is released. + cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second) + defer cancel() + _, cleanupErr := execer.ExecContext(cleanupCtx, "DROP VIEW IF EXISTS "+quoteIdentifier(source.View()), nil) + release() + if cleanupErr != nil { + err = errors.Join(err, fmt.Errorf("drop arrow ingest view %q: %w", source.View(), cleanupErr)) + } + }() return execer.ExecContext(ctx, query, nil) } +func quoteIdentifier(name string) string { + return `"` + strings.ReplaceAll(name, `"`, `""`) + `"` +} + func (p *Pool) RegisterScalarFunction(ctx context.Context, function ScalarFunction) error { return RegisterScalarFunction(ctx, p, function) } diff --git a/pkg/db/pool_test.go b/pkg/db/pool_test.go index c095422b..f749cb13 100644 --- a/pkg/db/pool_test.go +++ b/pkg/db/pool_test.go @@ -4,6 +4,11 @@ import ( "context" "sync" "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/memory" + arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" ) func TestNewPool(t *testing.T) { @@ -155,6 +160,59 @@ func TestPool_Arrow_Concurrent(t *testing.T) { wg.Wait() } +func TestPool_ExecArrowIngestDropsView(t *testing.T) { + pool, err := NewPool("") + if err != nil { + t.Fatal(err) + } + defer pool.Close() + + ctx := context.Background() + if _, err := pool.Exec(ctx, "CREATE TABLE ingest_target (value INTEGER)"); err != nil { + t.Fatal(err) + } + + schema := arrow.NewSchema([]arrow.Field{{Name: "value", Type: arrow.PrimitiveTypes.Int32}}, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + builder.Field(0).(*array.Int32Builder).Append(42) + record := builder.NewRecordBatch() + defer record.Release() + reader, err := array.NewRecordReader(schema, []arrow.RecordBatch{record}) + if err != nil { + t.Fatal(err) + } + defer reader.Release() + + source := arrowingest.NewSource(reader) + query := "INSERT INTO ingest_target SELECT * FROM " + quoteIdentifier(source.View()) + if _, err := pool.ExecArrowIngest(ctx, source, query); err != nil { + t.Fatal(err) + } + + conn, err := pool.Conn(ctx) + if err != nil { + t.Fatal(err) + } + defer conn.Close() + + var value int + if err := conn.QueryRow(ctx, "SELECT value FROM ingest_target").Scan(&value); err != nil { + t.Fatal(err) + } + if value != 42 { + t.Fatalf("inserted value = %d, want 42", value) + } + + var views int + if err := conn.QueryRow(ctx, "SELECT count(*) FROM duckdb_views() WHERE view_name = ?", source.View()).Scan(&views); err != nil { + t.Fatal(err) + } + if views != 0 { + t.Fatalf("Arrow ingest view %q remains in the DuckDB catalog", source.View()) + } +} + func Test_print(t *testing.T) { t.Log("[{\"address\":\"ул. Мира - ул. Мичурина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.530669834,66.644954681]}\",\"id\":\"0381c536-6efc-49e8-b2ec-87303ba0d4f4\",\"isManaged\":false,\"name\":\"C-005\",\"number\":\"C005\"},{\"address\":\"Пермь, улица КИМ, 72\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.021034275,56.293535829]}\",\"id\":\"9cc382ea-5cc5-4b50-bb4f-11ea2de0a393\",\"isManaged\":false,\"name\":\"ДК КДУ-КМД\",\"number\":\"2001\"},{\"address\":\"ул. Мира- Чубынина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.530721108,66.613830328]}\",\"id\":\"2472a103-5e54-49f5-aa78-e690785ea83a\",\"isManaged\":false,\"name\":\"ИДК1-02\",\"number\":\"002\"},{\"address\":\"ул. Чубынина - Мира\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531680341,66.612414122]}\",\"id\":\"6aa3eaa4-cfd9-431d-a45f-dc88820604aa\",\"isManaged\":false,\"name\":\"ИДК1-03\",\"number\":\"103\"},{\"address\":\" Арктическая-Губкина-Матросова\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.538271784,66.62610814]}\",\"id\":\"506b8ea2-615c-429d-bb29-bd0dc15e9996\",\"isManaged\":true,\"name\":\"ИДКЗ-01\",\"number\":\"301\"},{\"address\":\" Губкина - Зои Космодемьянской\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.537984428,66.633174419]}\",\"id\":\"841dd2cb-64a1-4d53-b76c-74869a4d01e6\",\"isManaged\":true,\"name\":\"ИДКЗ-02\",\"number\":\"302\"},{\"address\":\"Броднева -Губкина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.537782631,66.639154851]}\",\"id\":\"345ce8d4-65b4-46d3-92e5-b2a48876413e\",\"isManaged\":false,\"name\":\"ИДКЗ-03\",\"number\":\"303\"},{\"address\":\" Матросова - Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.536092423,66.625503302]}\",\"id\":\"9f7591dd-aa75-4d29-8627-e8e9486986c9\",\"isManaged\":false,\"name\":\"ИДКЗ-04\",\"number\":\"304\"},{\"address\":\" Космодемьянской - Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.535844447,66.632648706]}\",\"id\":\"bb0ad7e1-50e3-4982-96c7-70dc2ea54fb8\",\"isManaged\":false,\"name\":\"ИДКЗ-05\",\"number\":\"305\"},{\"address\":\"Броднева-Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.535609528,66.638635397]}\",\"id\":\"eac5e822-ce2a-4425-8a27-47b1f88d9836\",\"isManaged\":false,\"name\":\"ИДКЗ-06\",\"number\":\"306\"},{\"address\":\"Ямальская - Матросова\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534233819,66.625117064]}\",\"id\":\"af482037-3796-4d53-b59c-7e82dfeb454d\",\"isManaged\":false,\"name\":\"ИДКЗ-07\",\"number\":\"307\"},{\"address\":\"Ямальская - Космодемьянской\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534003136,66.632198095]}\",\"id\":\"86952cf0-947d-4b9e-84c8-cfa44a3dd5a7\",\"isManaged\":false,\"name\":\"ИДКЗ-08\",\"number\":\"308\"},{\"address\":\"Ямальская- Броднева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.533776997,66.638131142]}\",\"id\":\"498a3164-6e27-4908-b811-5a6b710be17d\",\"isManaged\":false,\"name\":\"ИДКЗ-09\",\"number\":\"309\"},{\"address\":\"улица Чубынина/улица Республики\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.529226826,66.616048263]}\",\"id\":\"d538df50-9193-4a1e-a02c-89f6fd4854cb\",\"isManaged\":false,\"name\":\"Инвиан 1\",\"number\":\"Инв-01\"},{\"address\":\"Пермь\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.019055099,56.290804077]}\",\"id\":\"962efde4-62e2-4128-a438-18f43f8b56e4\",\"isManaged\":true,\"name\":\"Инвиан-02\",\"number\":\"Инв-02\"},{\"address\":\"ул. Богдана Кнунянца\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.549909425,66.579353213]}\",\"id\":\"6d736df2-e46a-4698-9ac2-b02a99eaefd6\",\"isManaged\":false,\"name\":\"С-001\",\"number\":\"С001\"},{\"address\":\"ул. Почтовая - просп. Молодежи\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.538073574,66.596063375]}\",\"id\":\"7db4d259-75a7-4e68-a765-e6c0e91573e4\",\"isManaged\":false,\"name\":\"С-002\",\"number\":\"С002\"},{\"address\":\"ул. Объездная - просп. Молодежи\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.542511344,66.621533632]}\",\"id\":\"bafa44ff-e40d-460c-9f0c-3c3732833245\",\"isManaged\":false,\"name\":\"С-003\",\"number\":\"С003\"},{\"address\":\"ул. Республики - ул. Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534745795,66.652078629]}\",\"id\":\"3fdb1c7f-d69e-40fd-a29b-316dc95faf65\",\"isManaged\":false,\"name\":\"С-004\",\"number\":\"С004\"},{\"address\":\"ул. Броднева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531127024,66.637487411]}\",\"id\":\"c4558a8e-4752-4131-b8a5-1bcf97c925e3\",\"isManaged\":false,\"name\":\"С-006\",\"number\":\"С006\"},{\"address\":\"ул. Артеева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531383389,66.631565094]}\",\"id\":\"62a25f70-b9c3-41a3-acc4-46344ab75e08\",\"isManaged\":false,\"name\":\"С-007\",\"number\":\"С007\"},{\"address\":\"ул. Мира\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.53162266,66.624526978]}\",\"id\":\"dc0e2a05-1df5-4854-ab12-78623105abae\",\"isManaged\":false,\"name\":\"С-008\",\"number\":\"С008\"},{\"address\":\"Пермь, улица КИМ, 74А\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.019861248,56.291919989]}\",\"id\":\"cea96a45-b726-4e9e-8c86-a507f020257b\",\"isManaged\":true,\"name\":\"Спектр2-01\",\"number\":\"С-01\"},{\"address\":\"Пермь\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.020333595,56.292587678]}\",\"id\":\"6cc55c55-87f6-4918-a7ac-46172539d73b\",\"isManaged\":true,\"name\":\"Спектр2-02\",\"number\":\"С-02\"}]") } diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 8538eea1..1cd062c4 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -30,6 +30,9 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if source.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } + if source.View() == "" { + return nil, fmt.Errorf("missing arrow view name") + } info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) if err != nil { @@ -291,7 +294,7 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info } // ingestNode builds the INSERT ... SELECT statement that copies rows from the -// temporary DuckDB Arrow view into the target DB table. +// request-scoped DuckDB Arrow view into the target DB table. // // - info is the GraphQL data object plus its DB table/column mapping. // - mutation is the GraphQL insert mutation used for insert defaults. @@ -301,15 +304,15 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info // columns by resolveIngestColumns. // - permissionData contains extra GraphQL input values injected by the // permission layer; they do not come from the Arrow stream. -// - arrowViewName is the per-connection DuckDB view registered from the -// Arrow reader during execution. +// - arrowViewName is the globally unique DuckDB view registered from the +// Arrow reader for this ingest execution. func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any, arrowViewName string) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { // fieldValues is keyed by GraphQL field name. Each value is a SQL // expression evaluated in the SELECT part of INSERT ... SELECT. - // The expression may reference an Arrow column from the temporary + // The expression may reference an Arrow column from the ingest // DuckDB view, or it may be a constant/default/permission value. fieldValues := make(map[string]string, len(columns)) for _, c := range columns { @@ -353,7 +356,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e fieldValues[name] = sqlValue } // Arrow ingest SELECT expressions are evaluated by DuckDB because - // the temporary Arrow view is registered on a DuckDB connection. + // the Arrow view is registered in DuckDB. // Default/auth helper expressions must therefore use the same canonical // DuckDB staging types before optional target casting is applied below. if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engines.NewArrowIngestStagingBuilder()); err != nil { @@ -400,8 +403,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } target := info.SQL(ctx, engines.Ident(info.Catalog)) - // The FROM relation is the fixed temporary Arrow view registered on - // the same DuckDB connection that executes this statement. + // The FROM relation is this ingest request's globally unique Arrow view. return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s", target, strings.Join(targetFields, ", "), From 3b5c69d9681ef9537ab85a434ecc7e7bfcb18551 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 20 Jun 2026 12:02:30 +0400 Subject: [PATCH 31/36] ipc ingest --- ipc-ingest.go | 4 ++-- pkg/arrow-ingest/source_test.go | 20 ---------------- .../source.go => db/arrow_ingest_source.go} | 23 +++++++++---------- pkg/db/arrow_ingest_source_test.go | 20 ++++++++++++++++ pkg/db/pool.go | 3 +-- pkg/db/pool_test.go | 3 +-- pkg/planner/node_arrow_ingest.go | 4 ++-- pkg/planner/planer.go | 4 ++-- 8 files changed, 39 insertions(+), 42 deletions(-) delete mode 100644 pkg/arrow-ingest/source_test.go rename pkg/{arrow-ingest/source.go => db/arrow_ingest_source.go} (71%) create mode 100644 pkg/db/arrow_ingest_source_test.go diff --git a/ipc-ingest.go b/ipc-ingest.go index 15452056..7799d26b 100644 --- a/ipc-ingest.go +++ b/ipc-ingest.go @@ -10,8 +10,8 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" - arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/auth" + "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/perm" ) @@ -75,7 +75,7 @@ func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) { return } defer reader.Release() - source := arrowingest.NewSource(reader) + source := db.NewArrowIngestSource(reader) plan, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, source) if err != nil { diff --git a/pkg/arrow-ingest/source_test.go b/pkg/arrow-ingest/source_test.go deleted file mode 100644 index 69c197f0..00000000 --- a/pkg/arrow-ingest/source_test.go +++ /dev/null @@ -1,20 +0,0 @@ -package arrowingest - -import ( - "strings" - "testing" -) - -func TestNewSourceUsesUniqueViewName(t *testing.T) { - first := NewSource(nil) - second := NewSource(nil) - - if first.View() == second.View() { - t.Fatalf("sources share view name %q", first.View()) - } - for _, name := range []string{first.View(), second.View()} { - if !strings.HasPrefix(name, viewNamePrefix) { - t.Fatalf("view name %q does not start with %q", name, viewNamePrefix) - } - } -} diff --git a/pkg/arrow-ingest/source.go b/pkg/db/arrow_ingest_source.go similarity index 71% rename from pkg/arrow-ingest/source.go rename to pkg/db/arrow_ingest_source.go index 2ad1b3dc..ea3b6320 100644 --- a/pkg/arrow-ingest/source.go +++ b/pkg/db/arrow_ingest_source.go @@ -1,4 +1,4 @@ -package arrowingest +package db import ( "fmt" @@ -9,30 +9,29 @@ import ( "github.com/google/uuid" ) -const viewNamePrefix = "_hugr_arrow_view_" +const arrowIngestViewNamePrefix = "_hugr_arrow_view_" -// Source is the shared contract between the IPC ingest handler, planner, and -// DB executor. The planner builds SQL against the source view name; the DB -// executor registers Reader under that same globally unique DuckDB view name. -type Source struct { +// ArrowIngestSource binds an Arrow reader to the globally unique DuckDB view +// name used by both the planner and the ingest executor. +type ArrowIngestSource struct { Reader array.RecordReader viewName string } -func NewSource(reader array.RecordReader) Source { - return Source{ +func NewArrowIngestSource(reader array.RecordReader) ArrowIngestSource { + return ArrowIngestSource{ Reader: reader, - viewName: viewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""), + viewName: arrowIngestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""), } } -func (s Source) View() string { +func (s ArrowIngestSource) View() string { return s.viewName } // NeedsSpatial reports whether the Arrow source carries geometry extension // metadata that requires DuckDB's spatial extension before registering the view. -func (s Source) NeedsSpatial() bool { +func (s ArrowIngestSource) NeedsSpatial() bool { if s.Reader == nil || s.Reader.Schema() == nil { return false } @@ -51,7 +50,7 @@ func (s Source) NeedsSpatial() bool { } // RegisterView registers the source reader under the source view name. -func (s Source) RegisterView(arrowConn interface { +func (s ArrowIngestSource) RegisterView(arrowConn interface { RegisterView(reader array.RecordReader, viewName string) (func(), error) }) (func(), error) { if s.Reader == nil { diff --git a/pkg/db/arrow_ingest_source_test.go b/pkg/db/arrow_ingest_source_test.go new file mode 100644 index 00000000..51e2b554 --- /dev/null +++ b/pkg/db/arrow_ingest_source_test.go @@ -0,0 +1,20 @@ +package db + +import ( + "strings" + "testing" +) + +func TestNewArrowIngestSourceUsesUniqueViewName(t *testing.T) { + first := NewArrowIngestSource(nil) + second := NewArrowIngestSource(nil) + + if first.View() == second.View() { + t.Fatalf("sources share view name %q", first.View()) + } + for _, name := range []string{first.View(), second.View()} { + if !strings.HasPrefix(name, arrowIngestViewNamePrefix) { + t.Fatalf("view name %q does not start with %q", name, arrowIngestViewNamePrefix) + } + } +} diff --git a/pkg/db/pool.go b/pkg/db/pool.go index 0289de36..227337ce 100644 --- a/pkg/db/pool.go +++ b/pkg/db/pool.go @@ -11,7 +11,6 @@ import ( "time" "github.com/duckdb/duckdb-go/v2" - arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" ) type Config struct { @@ -226,7 +225,7 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) { // ExecArrowIngest registers source.Reader as a globally named DuckDB view, // executes query, then drops the view before releasing the Arrow stream. -func (p *Pool) ExecArrowIngest(ctx context.Context, source arrowingest.Source, query string) (result sql.Result, err error) { +func (p *Pool) ExecArrowIngest(ctx context.Context, source ArrowIngestSource, query string) (result sql.Result, err error) { if source.Reader == nil { return nil, fmt.Errorf("missing arrow reader") } diff --git a/pkg/db/pool_test.go b/pkg/db/pool_test.go index f749cb13..880abb5c 100644 --- a/pkg/db/pool_test.go +++ b/pkg/db/pool_test.go @@ -8,7 +8,6 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/memory" - arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" ) func TestNewPool(t *testing.T) { @@ -184,7 +183,7 @@ func TestPool_ExecArrowIngestDropsView(t *testing.T) { } defer reader.Release() - source := arrowingest.NewSource(reader) + source := NewArrowIngestSource(reader) query := "INSERT INTO ingest_target SELECT * FROM " + quoteIdentifier(source.View()) if _, err := pool.ExecArrowIngest(ctx, source, query); err != nil { t.Fatal(err) diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 1cd062c4..9e395c46 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -6,11 +6,11 @@ import ( "strings" "github.com/apache/arrow-go/v18/arrow" - arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/auth" "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/pkg/perm" "github.com/vektah/gqlparser/v2/ast" @@ -23,7 +23,7 @@ type ingestColumn struct { InputDef *ast.FieldDefinition } -func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, source arrowingest.Source) (*QueryPlanNode, error) { +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, source db.ArrowIngestSource) (*QueryPlanNode, error) { if dataObject == "" { return nil, fmt.Errorf("missing data object") } diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go index 0fd0a74d..b17fb41a 100644 --- a/pkg/planner/planer.go +++ b/pkg/planner/planer.go @@ -4,9 +4,9 @@ import ( "context" "errors" - arrowingest "github.com/hugr-lab/query-engine/pkg/arrow-ingest" "github.com/hugr-lab/query-engine/pkg/catalog" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/db" "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/types" "github.com/vektah/gqlparser/v2/ast" @@ -71,7 +71,7 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as // The Arrow source is part of this planning API because its schema drives column // resolution and ingest casting, while its view name is the staging relation used // in the generated INSERT ... SELECT. -func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, source arrowingest.Source) (*QueryPlan, error) { +func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, source db.ArrowIngestSource) (*QueryPlan, error) { node, err := ingestRootNode(ctx, provider, s.engines, dataObject, source) if err != nil { return nil, err From 96ddd83d1851fbf3828c62770de6aa2ea7897810 Mon Sep 17 00:00:00 2001 From: vadim Date: Sat, 20 Jun 2026 12:55:22 +0400 Subject: [PATCH 32/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 156 ++++++++++++++++++ .../schemas/duck_ingest/schema.graphql | 13 ++ .../ingest-postgres/ingest_postgres_test.go | 143 ++++++++++++++++ .../ingest-postgres/testdata/init.sql | 13 ++ .../testdata/schemas/pg_ingest/schema.graphql | 13 ++ pkg/engines/arrow_ingest.go | 7 +- pkg/engines/arrow_ingest_test.go | 44 ++++- pkg/engines/duckdb.go | 26 ++- pkg/engines/postgres.go | 16 +- 9 files changed, 395 insertions(+), 36 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index e2c05b89..cf04a30f 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -134,6 +134,19 @@ func setupEnv(t *testing.T) *ingestEnv { is_active BOOLEAN NOT NULL DEFAULT true, owner_id BIGINT, payload JSON, + payload_large_string JSON, + payload_string_view JSON, + payload_binary JSON, + payload_large_binary JSON, + payload_binary_view JSON, + payload_struct JSON, + payload_list JSON, + payload_large_list JSON, + payload_fixed_size_list JSON, + payload_list_view JSON, + payload_large_list_view JSON, + payload_map JSON, + payload_scalar JSON, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), geom GEOMETRY, geom_wkt GEOMETRY, @@ -278,6 +291,136 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b return b.NewRecord() } +var jsonPhysicalTypeColumns = []string{ + "payload", + "payload_large_string", + "payload_string_view", + "payload_binary", + "payload_large_binary", + "payload_binary_view", + "payload_struct", + "payload_list", + "payload_large_list", + "payload_fixed_size_list", + "payload_list_view", + "payload_large_list_view", + "payload_map", + "payload_scalar", +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + structType := arrow.StructOf( + arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + ) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "payload_large_string", Type: arrow.BinaryTypes.LargeString, Nullable: false}, + {Name: "payload_string_view", Type: arrow.BinaryTypes.StringView, Nullable: false}, + {Name: "payload_binary", Type: arrow.BinaryTypes.Binary, Nullable: false}, + {Name: "payload_large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: false}, + {Name: "payload_binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: false}, + {Name: "payload_struct", Type: structType, Nullable: false}, + {Name: "payload_list", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_large_list", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_fixed_size_list", Type: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_list_view", Type: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + }, nil) + + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).Append("json-physical-types") + b.Field(1).(*array.Float64Builder).Append(1) + b.Field(2).(*array.BooleanBuilder).Append(true) + b.Field(3).(*array.StringBuilder).Append(`{"kind":"string"}`) + b.Field(4).(*array.LargeStringBuilder).Append(`{"kind":"large_string"}`) + b.Field(5).(*array.StringViewBuilder).Append(`{"kind":"string_view"}`) + b.Field(6).(*array.BinaryBuilder).Append([]byte(`{"kind":"binary"}`)) + b.Field(7).(*array.BinaryBuilder).Append([]byte(`{"kind":"large_binary"}`)) + b.Field(8).(*array.BinaryViewBuilder).Append([]byte(`{"kind":"binary_view"}`)) + + structBuilder := b.Field(9).(*array.StructBuilder) + structBuilder.Append(true) + structBuilder.FieldBuilder(0).(*array.StringBuilder).Append("struct") + structBuilder.FieldBuilder(1).(*array.Int64Builder).Append(14) + + listBuilder := b.Field(10).(*array.ListBuilder) + listBuilder.Append(true) + listBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{1, 2}, nil) + largeListBuilder := b.Field(11).(*array.LargeListBuilder) + largeListBuilder.Append(true) + largeListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{3, 4}, nil) + fixedListBuilder := b.Field(12).(*array.FixedSizeListBuilder) + fixedListBuilder.Append(true) + fixedListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{5, 6}, nil) + listViewBuilder := b.Field(13).(*array.ListViewBuilder) + listViewBuilder.AppendWithSize(true, 2) + listViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{7, 8}, nil) + largeListViewBuilder := b.Field(14).(*array.LargeListViewBuilder) + largeListViewBuilder.AppendWithSize(true, 2) + largeListViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{9, 10}, nil) + mapBuilder := b.Field(15).(*array.MapBuilder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) + b.Field(16).(*array.Int64Builder).Append(13) + return b.NewRecordBatch() +} + +func jsonPhysicalTypesExpected() map[string]any { + return map[string]any{ + "name": "json-physical-types", + "payload": map[string]any{"kind": "string"}, + "payload_large_string": map[string]any{"kind": "large_string"}, + "payload_string_view": map[string]any{"kind": "string_view"}, + "payload_binary": map[string]any{"kind": "binary"}, + "payload_large_binary": map[string]any{"kind": "large_binary"}, + "payload_binary_view": map[string]any{"kind": "binary_view"}, + "payload_struct": map[string]any{"kind": "struct", "count": float64(14)}, + "payload_list": []any{float64(1), float64(2)}, + "payload_large_list": []any{float64(3), float64(4)}, + "payload_fixed_size_list": []any{float64(5), float64(6)}, + "payload_list_view": []any{float64(7), float64(8)}, + "payload_large_list_view": []any{float64(9), float64(10)}, + "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, + "payload_scalar": "13", + } +} + +func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { + t.Helper() + query := fmt.Sprintf(`{ + %s { + events(filter: {name: {eq: "json-physical-types"}}) { + name + %s + } + } + }`, dsName, strings.Join(jsonPhysicalTypeColumns, "\n")) + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data := payload["data"].(map[string]any) + root := data[dsName].(map[string]any) + rows := root["events"].([]any) + require.Len(t, rows, 1, "response: %s", string(body)) + assert.Equal(t, jsonPhysicalTypesExpected(), rows[0]) +} + // --- Core tests ----------------------------------------------------------- func TestIngest_DuckDB_RoundTrip(t *testing.T) { @@ -337,6 +480,19 @@ func TestIngest_DuckDB_RoundTrip(t *testing.T) { assert.Equal(t, []bool{true, false, true}, gotHasJSON) } +func TestIngest_DuckDB_JSONPhysicalTypes(t *testing.T) { + env := setupEnv(t) + rec := makeJSONPhysicalTypesRecord(t) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns...) + assert.ElementsMatch(t, expectedColumns, res.Columns) + assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) +} + func TestIngest_DuckDB_PermissionData(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index 3c410e3f..4bca0a04 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -5,6 +5,19 @@ type events @table(name: "events") { is_active: Boolean! @default(value: "true") owner_id: BigInt payload: JSON + payload_large_string: JSON + payload_string_view: JSON + payload_binary: JSON + payload_large_binary: JSON + payload_binary_view: JSON + payload_struct: JSON + payload_list: JSON + payload_large_list: JSON + payload_fixed_size_list: JSON + payload_list_view: JSON + payload_large_list_view: JSON + payload_map: JSON + payload_scalar: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 6e0c4c30..839977a6 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -265,6 +265,136 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b return b.NewRecord() } +var jsonPhysicalTypeColumns = []string{ + "payload", + "payload_large_string", + "payload_string_view", + "payload_binary", + "payload_large_binary", + "payload_binary_view", + "payload_struct", + "payload_list", + "payload_large_list", + "payload_fixed_size_list", + "payload_list_view", + "payload_large_list_view", + "payload_map", + "payload_scalar", +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + structType := arrow.StructOf( + arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + ) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "payload_large_string", Type: arrow.BinaryTypes.LargeString, Nullable: false}, + {Name: "payload_string_view", Type: arrow.BinaryTypes.StringView, Nullable: false}, + {Name: "payload_binary", Type: arrow.BinaryTypes.Binary, Nullable: false}, + {Name: "payload_large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: false}, + {Name: "payload_binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: false}, + {Name: "payload_struct", Type: structType, Nullable: false}, + {Name: "payload_list", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_large_list", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_fixed_size_list", Type: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_list_view", Type: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, + {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + }, nil) + + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).Append("json-physical-types") + b.Field(1).(*array.Float64Builder).Append(1) + b.Field(2).(*array.BooleanBuilder).Append(true) + b.Field(3).(*array.StringBuilder).Append(`{"kind":"string"}`) + b.Field(4).(*array.LargeStringBuilder).Append(`{"kind":"large_string"}`) + b.Field(5).(*array.StringViewBuilder).Append(`{"kind":"string_view"}`) + b.Field(6).(*array.BinaryBuilder).Append([]byte(`{"kind":"binary"}`)) + b.Field(7).(*array.BinaryBuilder).Append([]byte(`{"kind":"large_binary"}`)) + b.Field(8).(*array.BinaryViewBuilder).Append([]byte(`{"kind":"binary_view"}`)) + + structBuilder := b.Field(9).(*array.StructBuilder) + structBuilder.Append(true) + structBuilder.FieldBuilder(0).(*array.StringBuilder).Append("struct") + structBuilder.FieldBuilder(1).(*array.Int64Builder).Append(14) + + listBuilder := b.Field(10).(*array.ListBuilder) + listBuilder.Append(true) + listBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{1, 2}, nil) + largeListBuilder := b.Field(11).(*array.LargeListBuilder) + largeListBuilder.Append(true) + largeListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{3, 4}, nil) + fixedListBuilder := b.Field(12).(*array.FixedSizeListBuilder) + fixedListBuilder.Append(true) + fixedListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{5, 6}, nil) + listViewBuilder := b.Field(13).(*array.ListViewBuilder) + listViewBuilder.AppendWithSize(true, 2) + listViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{7, 8}, nil) + largeListViewBuilder := b.Field(14).(*array.LargeListViewBuilder) + largeListViewBuilder.AppendWithSize(true, 2) + largeListViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{9, 10}, nil) + mapBuilder := b.Field(15).(*array.MapBuilder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) + b.Field(16).(*array.Int64Builder).Append(13) + return b.NewRecordBatch() +} + +func jsonPhysicalTypesExpected() map[string]any { + return map[string]any{ + "name": "json-physical-types", + "payload": map[string]any{"kind": "string"}, + "payload_large_string": map[string]any{"kind": "large_string"}, + "payload_string_view": map[string]any{"kind": "string_view"}, + "payload_binary": map[string]any{"kind": "binary"}, + "payload_large_binary": map[string]any{"kind": "large_binary"}, + "payload_binary_view": map[string]any{"kind": "binary_view"}, + "payload_struct": map[string]any{"kind": "struct", "count": float64(14)}, + "payload_list": []any{float64(1), float64(2)}, + "payload_large_list": []any{float64(3), float64(4)}, + "payload_fixed_size_list": []any{float64(5), float64(6)}, + "payload_list_view": []any{float64(7), float64(8)}, + "payload_large_list_view": []any{float64(9), float64(10)}, + "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, + "payload_scalar": "13", + } +} + +func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { + t.Helper() + query := fmt.Sprintf(`{ + %s { + events(filter: {name: {eq: "json-physical-types"}}) { + name + %s + } + } + }`, dsName, strings.Join(jsonPhysicalTypeColumns, "\n")) + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data := payload["data"].(map[string]any) + root := data[dsName].(map[string]any) + rows := root["events"].([]any) + require.Len(t, rows, 1, "response: %s", string(body)) + assert.Equal(t, jsonPhysicalTypesExpected(), rows[0]) +} + // --- Tests ---------------------------------------------------------------- func TestIngest_Postgres_RoundTrip(t *testing.T) { @@ -318,6 +448,19 @@ func TestIngest_Postgres_RoundTrip(t *testing.T) { assert.Equal(t, []bool{true, false, true}, gotHasJSON) // beta has NULL payload } +func TestIngest_Postgres_JSONPhysicalTypes(t *testing.T) { + env := setupEnv(t) + rec := makeJSONPhysicalTypesRecord(t) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns...) + assert.ElementsMatch(t, expectedColumns, res.Columns) + assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) +} + func TestIngest_Postgres_PermissionData(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 0110f95b..9a607b06 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -12,6 +12,19 @@ CREATE TABLE events ( is_active BOOLEAN NOT NULL DEFAULT true, owner_id BIGINT, payload JSONB, + payload_large_string JSONB, + payload_string_view JSONB, + payload_binary JSONB, + payload_large_binary JSONB, + payload_binary_view JSONB, + payload_struct JSONB, + payload_list JSONB, + payload_large_list JSONB, + payload_fixed_size_list JSONB, + payload_list_view JSONB, + payload_large_list_view JSONB, + payload_map JSONB, + payload_scalar JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), geom GEOMETRY(Point, 0), geom_4326 GEOMETRY(Point, 4326), diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index dfe9807f..d8ecffa3 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -5,6 +5,19 @@ type events @table(name: "events") { is_active: Boolean! @default(value: "true") owner_id: BigInt payload: JSON + payload_large_string: JSON + payload_string_view: JSON + payload_binary: JSON + payload_large_binary: JSON + payload_binary_view: JSON + payload_struct: JSON + payload_list: JSON + payload_large_list: JSON + payload_fixed_size_list: JSON + payload_list_view: JSON + payload_large_list_view: JSON + payload_map: JSON + payload_scalar: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 0, type: POINT) geom_4326: Geometry @geometry_info(srid: 4326, type: POINT) diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index de68e53b..98e3983e 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -29,14 +29,15 @@ func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) string { switch arrowField.Type.ID() { - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW, - arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: return "try_cast(" + sourceExpr + " AS JSON)" + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + return "try_cast(decode(" + sourceExpr + ") AS JSON)" case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: return "to_json(" + sourceExpr + ")" default: - return sourceExpr + return "to_json(" + sourceExpr + ")" } } diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index 0abd2e64..554a29bf 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -10,6 +10,38 @@ import ( "github.com/vektah/gqlparser/v2/ast" ) +func TestArrowIngestJSONStagingExpr(t *testing.T) { + tests := []struct { + name string + typ arrow.DataType + want string + }{ + {name: "string", typ: arrow.BinaryTypes.String, want: "try_cast(payload AS JSON)"}, + {name: "large string", typ: arrow.BinaryTypes.LargeString, want: "try_cast(payload AS JSON)"}, + {name: "string view", typ: arrow.BinaryTypes.StringView, want: "try_cast(payload AS JSON)"}, + {name: "binary", typ: arrow.BinaryTypes.Binary, want: "try_cast(decode(payload) AS JSON)"}, + {name: "large binary", typ: arrow.BinaryTypes.LargeBinary, want: "try_cast(decode(payload) AS JSON)"}, + {name: "binary view", typ: arrow.BinaryTypes.BinaryView, want: "try_cast(decode(payload) AS JSON)"}, + {name: "struct", typ: arrow.StructOf(), want: "to_json(payload)"}, + {name: "list", typ: arrow.ListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "large list", typ: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "fixed size list", typ: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "list view", typ: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "large list view", typ: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "map", typ: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "scalar", typ: arrow.PrimitiveTypes.Int64, want: "to_json(payload)"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := arrowIngestJSONStagingExpr(arrow.Field{Name: "payload", Type: tt.typ}, "payload") + if got != tt.want { + t.Fatalf("got %q, want %q", got, tt.want) + } + }) + } +} + func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { field := geometryTestField("") @@ -27,7 +59,7 @@ func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { for _, tt := range tests { t.Run(tt.ext, func(t *testing.T) { - got, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + got, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: geoArrowTestType(tt.ext), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -104,7 +136,7 @@ func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { if tt.ext != "" { meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) } - got, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + got, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: tt.typ, Metadata: meta, @@ -139,7 +171,7 @@ func TestPostgresArrowIngestBuildsNativeGeoArrowDirectSelectExpr(t *testing.T) { for _, tt := range tests { t.Run(tt.ext, func(t *testing.T) { - got, err := postgresArrowIngestSelectExpr(field, arrow.Field{ + got, err := NewPostgres().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: geoArrowTestType(tt.ext), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -211,7 +243,7 @@ func TestPostgresArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { if tt.ext != "" { meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) } - got, err := postgresArrowIngestSelectExpr(field, arrow.Field{ + got, err := NewPostgres().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: tt.typ, Metadata: meta, @@ -233,7 +265,7 @@ func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { field := geometryTestField("") for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { t.Run(ext, func(t *testing.T) { - _, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + _, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: arrow.StructOf(), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": ext}), @@ -264,7 +296,7 @@ func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { }, } { t.Run(tt.name, func(t *testing.T) { - _, err := duckDBArrowIngestSelectExpr(field, arrow.Field{ + _, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ Name: "geom", Type: tt.typ, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index b6c7611f..c7960a26 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -90,7 +90,17 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { } func (e *DuckDB) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - return duckDBArrowIngestSelectExpr(field, arrowField, sourceExpr) + if field == nil || field.Definition == nil { + return sourceExpr, nil + } + switch field.Definition.Type.Name() { + case base.JSONTypeName: + return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil + case base.GeometryTypeName: + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) + default: + return sourceExpr, nil + } } func (e *DuckDB) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { @@ -114,20 +124,6 @@ func (e *DuckDB) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, er return e.SQLValue(value) } -func duckDBArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - if field == nil || field.Definition == nil { - return sourceExpr, nil - } - switch field.Definition.Type.Name() { - case base.JSONTypeName: - return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil - case base.GeometryTypeName: - return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) - default: - return sourceExpr, nil - } -} - func (e *DuckDB) FieldValueByPath(sqlName, path string) string { if path == "" { return sqlName diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 95661499..9d949fd3 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -598,15 +598,6 @@ func (e *Postgres) CastFromIntermediateType(f *ast.Field, toJSON bool) (string, } func (e *Postgres) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - return postgresArrowIngestSelectExpr(field, arrowField, sourceExpr) -} - -func (e *Postgres) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { - var duckdb DuckDB - return duckdb.ArrowIngestLiteralExpr(field, value) -} - -func postgresArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { if field == nil || field.Definition == nil { return sourceExpr, nil } @@ -614,14 +605,15 @@ func postgresArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sou case base.JSONTypeName: return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil case base.GeometryTypeName: - return postgresArrowGeometryExpr(arrowField, sourceExpr) + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) default: return sourceExpr, nil } } -func postgresArrowGeometryExpr(arrowField arrow.Field, sourceExpr string) (string, error) { - return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) +func (e *Postgres) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { + var duckdb DuckDB + return duckdb.ArrowIngestLiteralExpr(field, value) } func pgRangeValueToSQLValue(v any) (string, error) { From 769c14f47f7fc565e5adba37c09611bec0a88fc0 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 21 Jun 2026 13:56:40 +0400 Subject: [PATCH 33/36] ipc ingest --- integration-test/ingest-duckdb/ingest_duckdb_test.go | 9 +++++++++ .../testdata/schemas/duck_ingest/schema.graphql | 1 + integration-test/ingest-postgres/ingest_postgres_test.go | 8 ++++++++ integration-test/ingest-postgres/testdata/init.sql | 1 + .../testdata/schemas/pg_ingest/schema.graphql | 1 + 5 files changed, 20 insertions(+) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index cf04a30f..8e1c1289 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -23,6 +23,7 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/extensions" "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" _ "github.com/duckdb/duckdb-go/v2" @@ -147,6 +148,7 @@ func setupEnv(t *testing.T) *ingestEnv { payload_large_list_view JSON, payload_map JSON, payload_scalar JSON, + payload_arrow_json JSON, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), geom GEOMETRY, geom_wkt GEOMETRY, @@ -306,6 +308,7 @@ var jsonPhysicalTypeColumns = []string{ "payload_large_list_view", "payload_map", "payload_scalar", + "payload_arrow_json", } func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { @@ -315,6 +318,8 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, ) + arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) schema := arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, @@ -333,6 +338,7 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + {Name: "payload_arrow_json", Type: arrowJSONType, Nullable: false}, }, nil) b := array.NewRecordBuilder(pool, schema) @@ -372,6 +378,8 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) b.Field(16).(*array.Int64Builder).Append(13) + arrowJSONBuilder := b.Field(17).(*array.ExtensionBuilder) + arrowJSONBuilder.StorageBuilder().(*array.StringBuilder).Append(`{"kind":"arrow_json"}`) return b.NewRecordBatch() } @@ -392,6 +400,7 @@ func jsonPhysicalTypesExpected() map[string]any { "payload_large_list_view": []any{float64(9), float64(10)}, "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, "payload_scalar": "13", + "payload_arrow_json": map[string]any{"kind": "arrow_json"}, } } diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index 4bca0a04..d5946e6a 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -18,6 +18,7 @@ type events @table(name: "events") { payload_large_list_view: JSON payload_map: JSON payload_scalar: JSON + payload_arrow_json: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 839977a6..6eec2a53 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -23,6 +23,7 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/extensions" "github.com/apache/arrow-go/v18/arrow/ipc" "github.com/apache/arrow-go/v18/arrow/memory" _ "github.com/jackc/pgx/v5/stdlib" @@ -280,6 +281,7 @@ var jsonPhysicalTypeColumns = []string{ "payload_large_list_view", "payload_map", "payload_scalar", + "payload_arrow_json", } func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { @@ -289,6 +291,8 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, ) + arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) schema := arrow.NewSchema([]arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, @@ -307,6 +311,7 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + {Name: "payload_arrow_json", Type: arrowJSONType, Nullable: false}, }, nil) b := array.NewRecordBuilder(pool, schema) @@ -346,6 +351,8 @@ func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) b.Field(16).(*array.Int64Builder).Append(13) + arrowJSONBuilder := b.Field(17).(*array.ExtensionBuilder) + arrowJSONBuilder.StorageBuilder().(*array.StringBuilder).Append(`{"kind":"arrow_json"}`) return b.NewRecordBatch() } @@ -366,6 +373,7 @@ func jsonPhysicalTypesExpected() map[string]any { "payload_large_list_view": []any{float64(9), float64(10)}, "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, "payload_scalar": "13", + "payload_arrow_json": map[string]any{"kind": "arrow_json"}, } } diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 9a607b06..ac9b352f 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -25,6 +25,7 @@ CREATE TABLE events ( payload_large_list_view JSONB, payload_map JSONB, payload_scalar JSONB, + payload_arrow_json JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), geom GEOMETRY(Point, 0), geom_4326 GEOMETRY(Point, 4326), diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index d8ecffa3..f3e1a197 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -18,6 +18,7 @@ type events @table(name: "events") { payload_large_list_view: JSON payload_map: JSON payload_scalar: JSON + payload_arrow_json: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 0, type: POINT) geom_4326: Geometry @geometry_info(srid: 4326, type: POINT) From fc03bee4c875a728f837315507b4157abce29d3a Mon Sep 17 00:00:00 2001 From: vadim Date: Tue, 23 Jun 2026 15:48:28 +0400 Subject: [PATCH 34/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 52 ++++++ .../ingest-postgres/docker-compose.yml | 6 + .../ingest-postgres/ingest_postgres_test.go | 94 ++++++++++- .../ingest-postgres/testdata/init.sql | 10 ++ .../testdata/schemas/pg_ingest/schema.graphql | 7 + pkg/engines/arrow_ingest.go | 54 +++++- pkg/engines/arrow_ingest_test.go | 159 +++--------------- pkg/engines/duckdb.go | 42 +---- pkg/engines/engines.go | 18 +- pkg/engines/postgres.go | 29 +--- pkg/planner/node_arrow_ingest.go | 33 ++-- 11 files changed, 259 insertions(+), 245 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index 8e1c1289..ab33158c 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -293,6 +293,33 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b return b.NewRecord() } +func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { + t.Helper() + payloadType := arrow.DataType(arrow.BinaryTypes.String) + payloadName := "payload" + if binary { + payloadType = arrow.BinaryTypes.Binary + payloadName = "payload_binary" + } + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: payloadName, Type: payloadType, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).Append("malformed-json") + b.Field(1).(*array.Float64Builder).Append(1) + b.Field(2).(*array.BooleanBuilder).Append(true) + if binary { + b.Field(3).(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + } else { + b.Field(3).(*array.StringBuilder).Append(`{"unterminated":`) + } + return b.NewRecord() +} + var jsonPhysicalTypeColumns = []string{ "payload", "payload_large_string", @@ -502,6 +529,31 @@ func TestIngest_DuckDB_JSONPhysicalTypes(t *testing.T) { assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) } +func TestIngest_DuckDB_RejectsMalformedJSON(t *testing.T) { + for _, tt := range []struct { + name string + binary bool + }{ + {name: "string"}, + {name: "binary", binary: true}, + } { + t.Run(tt.name, func(t *testing.T) { + env := setupEnv(t) + rec := makeMalformedJSONRecord(t, tt.binary) + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.Error(t, err) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Zero(t, count, "a failed JSON cast must roll back the entire ingest") + }) + } +} + func TestIngest_DuckDB_PermissionData(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/docker-compose.yml b/integration-test/ingest-postgres/docker-compose.yml index 65f6b3ac..dd148be5 100644 --- a/integration-test/ingest-postgres/docker-compose.yml +++ b/integration-test/ingest-postgres/docker-compose.yml @@ -1,6 +1,12 @@ services: postgres: image: postgis/postgis:16-3.4 + command: + - postgres + - -c + - logging_collector=on + - -c + - log_statement=all environment: POSTGRES_DB: ingestdb POSTGRES_USER: test diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 6eec2a53..0f51cd84 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -174,8 +174,8 @@ func setupEnv(t *testing.T) *ingestEnv { t.Skipf("%s not set — run integration-test/ingest-postgres/run.sh to spin up a postgres container", envPostgresDSN) } - // Truncate before each test to guarantee determinism (single shared table). - _, err := sharedPgConn.ExecContext(context.Background(), "TRUNCATE TABLE events RESTART IDENTITY") + // Truncate before each test to guarantee determinism. + _, err := sharedPgConn.ExecContext(context.Background(), "TRUNCATE TABLE events, binary_events RESTART IDENTITY") require.NoError(t, err) return &ingestEnv{ @@ -266,6 +266,33 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b return b.NewRecord() } +func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { + t.Helper() + payloadType := arrow.DataType(arrow.BinaryTypes.String) + payloadName := "payload" + if binary { + payloadType = arrow.BinaryTypes.Binary + payloadName = "payload_binary" + } + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: payloadName, Type: payloadType, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + defer b.Release() + b.Field(0).(*array.StringBuilder).Append("malformed-json") + b.Field(1).(*array.Float64Builder).Append(1) + b.Field(2).(*array.BooleanBuilder).Append(true) + if binary { + b.Field(3).(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + } else { + b.Field(3).(*array.StringBuilder).Append(`{"unterminated":`) + } + return b.NewRecord() +} + var jsonPhysicalTypeColumns = []string{ "payload", "payload_large_string", @@ -469,6 +496,69 @@ func TestIngest_Postgres_JSONPhysicalTypes(t *testing.T) { assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) } +func TestIngest_Postgres_RejectsMalformedJSON(t *testing.T) { + for _, tt := range []struct { + name string + binary bool + }{ + {name: "string"}, + {name: "binary", binary: true}, + } { + t.Run(tt.name, func(t *testing.T) { + env := setupEnv(t) + rec := makeMalformedJSONRecord(t, tt.binary) + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.Error(t, err) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Zero(t, count, "a failed JSON cast must roll back the entire ingest") + }) + } +} + +func TestIngest_Postgres_UsesBinaryCopyWithoutTextOnlyTypes(t *testing.T) { + env := setupEnv(t) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + { + Name: "geom", + Type: arrow.BinaryTypes.String, + Nullable: false, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"}), + }, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + b.Field(0).(*array.StringBuilder).Append("binary-copy") + b.Field(1).(*array.Float64Builder).Append(42) + b.Field(2).(*array.StringBuilder).Append("POINT (7.25 8.5)") + rec := b.NewRecord() + b.Release() + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.binary_events", rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + + var name, geom string + require.NoError(t, env.pgConn.QueryRow( + "SELECT name, ST_AsText(geom) FROM binary_events", + ).Scan(&name, &geom)) + assert.Equal(t, "binary-copy", name) + assert.Equal(t, "POINT(7.25 8.5)", compactWKT(geom)) + + const copyPrefix = `COPY "public"."binary_events"` + var serverLog string + require.Eventually(t, func() bool { + err := env.pgConn.QueryRow("SELECT pg_read_file(pg_current_logfile())").Scan(&serverLog) + return err == nil && strings.Contains(serverLog, copyPrefix) && + strings.Contains(serverLog[strings.LastIndex(serverLog, copyPrefix):], "FORMAT BINARY") + }, 5*time.Second, 100*time.Millisecond, "postgres log did not contain binary COPY for binary_events") +} + func TestIngest_Postgres_PermissionData(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index ac9b352f..46ae1632 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -41,3 +41,13 @@ CREATE TABLE events ( geom_multiline GEOMETRY(MultiLineString, 0), geom_multipolygon GEOMETRY(MultiPolygon, 0) ); + +-- This table intentionally contains only binary-COPY-compatible PostgreSQL +-- types. The integration suite uses it to verify that duckdb-postgres selects +-- FORMAT BINARY rather than falling back to text because of a JSONB column. +CREATE TABLE binary_events ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + value DOUBLE PRECISION NOT NULL, + geom GEOMETRY(Point, 0) +); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index f3e1a197..7656b17c 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -34,3 +34,10 @@ type events @table(name: "events") { geom_multiline: Geometry @geometry_info(srid: 0, type: MULTILINESTRING) geom_multipolygon: Geometry @geometry_info(srid: 0, type: MULTIPOLYGON) } + +type binary_events @table(name: "binary_events") { + id: BigInt! @pk @default(sequence: "binary_events_id_seq") + name: String! + value: Float! + geom: Geometry @geometry_info(srid: 0, type: POINT) +} diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 98e3983e..b7ce84e4 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -1,16 +1,19 @@ package engines import ( + "encoding/hex" "fmt" "strings" "github.com/apache/arrow-go/v18/arrow" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + ctypes "github.com/hugr-lab/query-engine/pkg/catalog/types" + "github.com/vektah/gqlparser/v2/ast" ) -// ArrowIngestStagingBuilder builds SQL fragments evaluated by DuckDB while an -// Arrow reader is registered as a temporary view. Target engines still decide -// how Arrow columns are shaped, but default/auth expression functions must be -// valid in this DuckDB staging SELECT. +// ArrowIngestStagingBuilder owns every SQL expression evaluated by DuckDB while +// an Arrow reader is registered as a view. Target-specific conversion, when a +// target needs one, is applied separately through EngineIngestTargetCaster. type ArrowIngestStagingBuilder struct { duckdb DuckDB } @@ -27,12 +30,51 @@ func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, return b.duckdb.FunctionCall(name, positional, named) } +// SelectExpr converts an Arrow-view column to its canonical DuckDB staging +// representation for the target GraphQL field. +func (b *ArrowIngestStagingBuilder) SelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { + if field == nil || field.Definition == nil { + return sourceExpr, nil + } + switch field.Definition.Type.Name() { + case base.JSONTypeName: + return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil + case base.GeometryTypeName: + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) + default: + return sourceExpr, nil + } +} + +// LiteralExpr converts a non-Arrow value, such as permission data, to a +// canonical DuckDB staging expression. +func (b *ArrowIngestStagingBuilder) LiteralExpr(field *ast.Field, value any) (string, error) { + if value == nil { + return "NULL", nil + } + if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { + geom, err := ctypes.ParseGeometryValue(value) + if err != nil { + return "", err + } + if geom == nil { + return "NULL", nil + } + wkbValue, err := ctypes.GeometryToSQLValue(geom) + if err != nil { + return "", err + } + return "ST_GeomFromWKB(from_hex('" + strings.ToUpper(hex.EncodeToString(wkbValue)) + "'))", nil + } + return b.duckdb.SQLValue(value) +} + func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) string { switch arrowField.Type.ID() { case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "try_cast(" + sourceExpr + " AS JSON)" + return "CAST(" + sourceExpr + " AS JSON)" case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: - return "try_cast(decode(" + sourceExpr + ") AS JSON)" + return "CAST(decode(" + sourceExpr + ") AS JSON)" case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: return "to_json(" + sourceExpr + ")" diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index 554a29bf..2e8df3ec 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -16,12 +16,12 @@ func TestArrowIngestJSONStagingExpr(t *testing.T) { typ arrow.DataType want string }{ - {name: "string", typ: arrow.BinaryTypes.String, want: "try_cast(payload AS JSON)"}, - {name: "large string", typ: arrow.BinaryTypes.LargeString, want: "try_cast(payload AS JSON)"}, - {name: "string view", typ: arrow.BinaryTypes.StringView, want: "try_cast(payload AS JSON)"}, - {name: "binary", typ: arrow.BinaryTypes.Binary, want: "try_cast(decode(payload) AS JSON)"}, - {name: "large binary", typ: arrow.BinaryTypes.LargeBinary, want: "try_cast(decode(payload) AS JSON)"}, - {name: "binary view", typ: arrow.BinaryTypes.BinaryView, want: "try_cast(decode(payload) AS JSON)"}, + {name: "string", typ: arrow.BinaryTypes.String, want: "CAST(payload AS JSON)"}, + {name: "large string", typ: arrow.BinaryTypes.LargeString, want: "CAST(payload AS JSON)"}, + {name: "string view", typ: arrow.BinaryTypes.StringView, want: "CAST(payload AS JSON)"}, + {name: "binary", typ: arrow.BinaryTypes.Binary, want: "CAST(decode(payload) AS JSON)"}, + {name: "large binary", typ: arrow.BinaryTypes.LargeBinary, want: "CAST(decode(payload) AS JSON)"}, + {name: "binary view", typ: arrow.BinaryTypes.BinaryView, want: "CAST(decode(payload) AS JSON)"}, {name: "struct", typ: arrow.StructOf(), want: "to_json(payload)"}, {name: "list", typ: arrow.ListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, {name: "large list", typ: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, @@ -42,8 +42,9 @@ func TestArrowIngestJSONStagingExpr(t *testing.T) { } } -func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { +func TestArrowIngestStagingBuildsNativeGeoArrowSelectExpr(t *testing.T) { field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() tests := []struct { ext string @@ -59,7 +60,7 @@ func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { for _, tt := range tests { t.Run(tt.ext, func(t *testing.T) { - got, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ + got, err := staging.SelectExpr(field, arrow.Field{ Name: "geom", Type: geoArrowTestType(tt.ext), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -79,8 +80,9 @@ func TestDuckDBArrowIngestBuildsNativeGeoArrowSelectExpr(t *testing.T) { } } -func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { +func TestArrowIngestStagingBuildsDirectGeometrySelectExpr(t *testing.T) { field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() tests := []struct { name string @@ -136,7 +138,7 @@ func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { if tt.ext != "" { meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) } - got, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ + got, err := staging.SelectExpr(field, arrow.Field{ Name: "geom", Type: tt.typ, Metadata: meta, @@ -154,118 +156,12 @@ func TestDuckDBArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { } } -func TestPostgresArrowIngestBuildsNativeGeoArrowDirectSelectExpr(t *testing.T) { - field := geometryTestField("4326") - - tests := []struct { - ext string - want string - }{ - {"geoarrow.point", "ST_Point(struct_extract(geom, 'x'), struct_extract(geom, 'y'))"}, - {"geoarrow.linestring", "ST_MakeLine(list_transform(geom"}, - {"geoarrow.polygon", "ST_MakePolygon(ST_MakeLine(list_transform(geom[1]"}, - {"geoarrow.multipoint", "ST_Multi(ST_Collect(list_transform(geom"}, - {"geoarrow.multilinestring", "ST_Multi(ST_Collect(list_transform(geom"}, - {"geoarrow.multipolygon", "ST_Multi(ST_Collect(list_transform(geom"}, - } - - for _, tt := range tests { - t.Run(tt.ext, func(t *testing.T) { - got, err := NewPostgres().ArrowIngestSelectExpr(field, arrow.Field{ - Name: "geom", - Type: geoArrowTestType(tt.ext), - Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), - }, "geom") - if err != nil { - t.Fatal(err) - } - if got == "geom" { - t.Fatalf("expected explicit conversion, got raw column") - } - if !strings.Contains(got, tt.want) || - strings.Contains(got, "'SRID=4326;'") || - strings.Contains(got, "ST_AsText(") { - t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) - } - }) - } -} - -func TestPostgresArrowIngestBuildsDirectGeometrySelectExpr(t *testing.T) { - field := geometryTestField("4326") - - tests := []struct { - name string - typ arrow.DataType - ext string - want string - }{ - { - name: "trusted geoarrow wkb is already materialized as geometry", - typ: arrow.BinaryTypes.Binary, - ext: "geoarrow.wkb", - want: "geom", - }, - { - name: "trusted geoarrow wkt parses directly from text", - typ: arrow.BinaryTypes.String, - ext: "geoarrow.wkt", - want: "ST_GeomFromText(geom, true)", - }, - { - name: "trusted geoarrow geojson parses directly from json", - typ: arrow.BinaryTypes.String, - ext: "geoarrow.geojson", - want: "ST_GeomFromGeoJSON(geom)", - }, - { - name: "trusted hugr geojson parses directly from json", - typ: arrow.BinaryTypes.String, - ext: "hugr.geojson", - want: "ST_GeomFromGeoJSON(geom)", - }, - { - name: "trusted plain geojson parses directly from json", - typ: arrow.BinaryTypes.String, - ext: "geojson", - want: "ST_GeomFromGeoJSON(geom)", - }, - { - name: "unannotated binary parses directly as wkb", - typ: arrow.BinaryTypes.Binary, - want: "ST_GeomFromWKB(geom)", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - meta := arrow.Metadata{} - if tt.ext != "" { - meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) - } - got, err := NewPostgres().ArrowIngestSelectExpr(field, arrow.Field{ - Name: "geom", - Type: tt.typ, - Metadata: meta, - }, "geom") - if err != nil { - t.Fatal(err) - } - if got != tt.want { - t.Fatalf("expected %s, got %s", tt.want, got) - } - if strings.Contains(got, "'SRID=4326;'") || strings.Contains(got, "ST_AsText(") { - t.Fatalf("expected direct geometry expression, got %s", got) - } - }) - } -} - func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { t.Run(ext, func(t *testing.T) { - _, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ + _, err := staging.SelectExpr(field, arrow.Field{ Name: "geom", Type: arrow.StructOf(), Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": ext}), @@ -279,6 +175,7 @@ func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() for _, tt := range []struct { name string typ arrow.DataType @@ -296,7 +193,7 @@ func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { }, } { t.Run(tt.name, func(t *testing.T) { - _, err := NewDuckDB().ArrowIngestSelectExpr(field, arrow.Field{ + _, err := staging.SelectExpr(field, arrow.Field{ Name: "geom", Type: tt.typ, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), @@ -311,10 +208,10 @@ func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { } } -func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { - engine := &Postgres{} +func TestArrowIngestStagingLiteralExpr(t *testing.T) { + staging := NewArrowIngestStagingBuilder() - jsonSQL, err := engine.ArrowIngestLiteralExpr(nil, map[string]any{"status": "ok"}) + jsonSQL, err := staging.LiteralExpr(nil, map[string]any{"status": "ok"}) if err != nil { t.Fatal(err) } @@ -322,7 +219,7 @@ func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { t.Fatalf("expected DuckDB JSON literal, got %s", jsonSQL) } - geomSQL, err := engine.ArrowIngestLiteralExpr(geometryTestField("4326"), orb.Point{1, 2}) + geomSQL, err := staging.LiteralExpr(geometryTestField("4326"), orb.Point{1, 2}) if err != nil { t.Fatal(err) } @@ -330,21 +227,7 @@ func TestPostgresArrowIngestLiteralExprUsesDuckDBStagingLiterals(t *testing.T) { strings.Contains(geomSQL, "'SRID=4326;'") || strings.Contains(geomSQL, "ST_GeomFromText") || strings.Contains(geomSQL, "POINT") { - t.Fatalf("expected Postgres WKB geometry literal, got %s", geomSQL) - } -} - -func TestDuckDBArrowIngestLiteralExprUsesWKBStagingGeometry(t *testing.T) { - engine := &DuckDB{} - - geomSQL, err := engine.ArrowIngestLiteralExpr(geometryTestField(""), orb.Point{1, 2}) - if err != nil { - t.Fatal(err) - } - if !strings.Contains(geomSQL, "ST_GeomFromWKB(from_hex('0101000000") || - strings.Contains(geomSQL, "ST_GeomFromText") || - strings.Contains(geomSQL, "POINT") { - t.Fatalf("expected DuckDB WKB geometry literal, got %s", geomSQL) + t.Fatalf("expected canonical WKB geometry literal, got %s", geomSQL) } } diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index c7960a26..7722833a 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -2,14 +2,12 @@ package engines import ( "context" - "encoding/hex" "encoding/json" "fmt" "strconv" "strings" "time" - "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -47,9 +45,8 @@ var scalarJSONInfo = map[string]jsonTypeInfo{ } var ( - _ Engine = &DuckDB{} - _ EngineArrowIngestCaster = &DuckDB{} - _ EngineAggregator = &DuckDB{} + _ Engine = &DuckDB{} + _ EngineAggregator = &DuckDB{} ) type DuckDB struct { @@ -89,41 +86,6 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { } } -func (e *DuckDB) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - if field == nil || field.Definition == nil { - return sourceExpr, nil - } - switch field.Definition.Type.Name() { - case base.JSONTypeName: - return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil - case base.GeometryTypeName: - return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) - default: - return sourceExpr, nil - } -} - -func (e *DuckDB) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { - if value == nil { - return "NULL", nil - } - if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { - geom, err := ctypes.ParseGeometryValue(value) - if err != nil { - return "", err - } - if geom == nil { - return "NULL", nil - } - wkbValue, err := ctypes.GeometryToSQLValue(geom) - if err != nil { - return "", err - } - return "ST_GeomFromWKB(from_hex('" + strings.ToUpper(hex.EncodeToString(wkbValue)) + "'))", nil - } - return e.SQLValue(value) -} - func (e *DuckDB) FieldValueByPath(sqlName, path string) string { if path == "" { return sqlName diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 48f69225..e09f41d0 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -5,7 +5,6 @@ import ( "fmt" "strings" - "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -73,21 +72,10 @@ type EngineTypeCaster interface { CastFromIntermediateType(field *ast.Field, toJSON bool) (string, error) } -type EngineArrowIngestCaster interface { - Engine - // ArrowIngestSelectExpr maps one Arrow-view column to a DuckDB staging SELECT - // expression using canonical DuckDB value types. - // Example: for a Geometry field, arrowField extension "geoarrow.geojson", and - // sourceExpr `geom_geojson`, this returns - // `ST_GeomFromGeoJSON(geom_geojson)` as a DuckDB GEOMETRY expression. - ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) - // ArrowIngestLiteralExpr returns a DuckDB-compatible literal/expression for - // non-Arrow values mixed into the ingest SELECT. - ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) -} - // EngineIngestTargetCaster is implemented by engines whose ingest target -// cannot consume canonical DuckDB staging values directly. +// cannot consume canonical DuckDB staging values directly. Engines that do not +// implement it explicitly accept canonical staging values as their ingest +// contract. type EngineIngestTargetCaster interface { Engine // CastIngestValueToTarget converts a DuckDB staging SELECT expression into diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 9d949fd3..099781e8 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -9,7 +9,6 @@ import ( "strings" "time" - "github.com/apache/arrow-go/v18/arrow" "github.com/hugr-lab/query-engine/pkg/catalog/compiler" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" @@ -21,11 +20,10 @@ import ( ) var ( - _ Engine = &Postgres{} - _ EngineQueryScanner = &Postgres{} - _ EngineTypeCaster = &Postgres{} - _ EngineArrowIngestCaster = &Postgres{} - _ EngineAggregator = &Postgres{} + _ Engine = &Postgres{} + _ EngineQueryScanner = &Postgres{} + _ EngineTypeCaster = &Postgres{} + _ EngineAggregator = &Postgres{} ) type Postgres struct { @@ -597,25 +595,6 @@ func (e *Postgres) CastFromIntermediateType(f *ast.Field, toJSON bool) (string, return Ident(f.Alias), nil } -func (e *Postgres) ArrowIngestSelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { - if field == nil || field.Definition == nil { - return sourceExpr, nil - } - switch field.Definition.Type.Name() { - case base.JSONTypeName: - return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil - case base.GeometryTypeName: - return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) - default: - return sourceExpr, nil - } -} - -func (e *Postgres) ArrowIngestLiteralExpr(field *ast.Field, value any) (string, error) { - var duckdb DuckDB - return duckdb.ArrowIngestLiteralExpr(field, value) -} - func pgRangeValueToSQLValue(v any) (string, error) { if v == nil { return "NULL", nil diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 9e395c46..3fe782d3 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -52,10 +52,6 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if !caps.Ingest.Available() { return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) } - ingestEngine, ok := engine.(engines.EngineArrowIngestCaster) - if !ok { - return nil, fmt.Errorf("engine %q declares IPC ingest support but does not implement Arrow ingest casting", engine.Type()) - } mutation := sdl.MutationInfo(ctx, provider, mutationField) if mutation == nil || mutation.Type != sdl.MutationTypeInsert { return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) @@ -75,7 +71,7 @@ func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Cata if err := checkIngestPermissions(ctx, provider, info, columns, permissionData); err != nil { return nil, err } - return ingestNode(ctx, info, mutation, ingestEngine, columns, permissionData, source.View()), nil + return ingestNode(ctx, info, mutation, engine, columns, permissionData, source.View()), nil } func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { @@ -298,18 +294,19 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info // // - info is the GraphQL data object plus its DB table/column mapping. // - mutation is the GraphQL insert mutation used for insert defaults. -// - engine converts Arrow-view expressions into values accepted by the -// target engine/table. +// - engine describes the target and optionally adapts canonical DuckDB +// staging values through EngineIngestTargetCaster. // - columns are Arrow columns already resolved to GraphQL fields and DB // columns by resolveIngestColumns. // - permissionData contains extra GraphQL input values injected by the // permission layer; they do not come from the Arrow stream. // - arrowViewName is the globally unique DuckDB view registered from the // Arrow reader for this ingest execution. -func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.EngineArrowIngestCaster, columns []ingestColumn, permissionData map[string]any, arrowViewName string) *QueryPlanNode { +func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any, arrowViewName string) *QueryPlanNode { return &QueryPlanNode{ Name: "ingest_" + info.Name, CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) { + staging := engines.NewArrowIngestStagingBuilder() // fieldValues is keyed by GraphQL field name. Each value is a SQL // expression evaluated in the SELECT part of INSERT ... SELECT. // The expression may reference an Arrow column from the ingest @@ -321,13 +318,11 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e // is applied later when targetFields is built. value := engines.Ident(c.ArrowField.Name) // Synthetic GraphQL field used only to pass type/directive - // metadata to the engine-specific Arrow ingest caster. + // metadata to the staging and optional target casters. field := ingestASTField(info, c.Field, c.FieldDef) - // Build the Arrow ingest SELECT expression for the target - // GraphQL/DB field. - // Examples: JSON to_json(...) or DuckDB staging geometry - // expressions used for both DuckDB and attached Postgres targets. - value, err := engine.ArrowIngestSelectExpr(field, c.ArrowField, value) + // Normalize the Arrow value to a canonical DuckDB expression. + // Target-specific adaptation, if required, is applied below. + value, err := staging.SelectExpr(field, c.ArrowField, value) if err != nil { return "", nil, err } @@ -347,9 +342,9 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e if fieldDef == nil { return "", nil, fmt.Errorf("permission data field %q definition not found in data object %q", name, info.Name) } - // Unlike Arrow columns, this value has no Arrow type. The target - // engine still decides how to shape the literal for ingest. - sqlValue, err := engine.ArrowIngestLiteralExpr(ingestASTField(info, fieldInfo, fieldDef), value) + // Unlike Arrow columns, this value has no Arrow type. Build its + // canonical DuckDB literal before optional target adaptation. + sqlValue, err := staging.LiteralExpr(ingestASTField(info, fieldInfo, fieldDef), value) if err != nil { return "", nil, err } @@ -359,7 +354,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e // the Arrow view is registered in DuckDB. // Default/auth helper expressions must therefore use the same canonical // DuckDB staging types before optional target casting is applied below. - if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), engines.NewArrowIngestStagingBuilder()); err != nil { + if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), staging); err != nil { return "", nil, err } var targetFields, selectExprs []string @@ -414,7 +409,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } } -func castIngestValueToTarget(engine engines.EngineArrowIngestCaster, field *ast.Field, stagingExpr string) (string, error) { +func castIngestValueToTarget(engine engines.Engine, field *ast.Field, stagingExpr string) (string, error) { targetCaster, ok := engine.(engines.EngineIngestTargetCaster) if !ok { return stagingExpr, nil From 53c839bcdb6598dcabc3bf8a33f7223ae381a92c Mon Sep 17 00:00:00 2001 From: vadim Date: Fri, 26 Jun 2026 01:24:08 +0400 Subject: [PATCH 35/36] ipc ingest --- .../ingest-postgres/ingest_postgres_test.go | 99 +++++++++++++++++++ .../ingest-postgres/testdata/init.sql | 11 +++ .../testdata/schemas/pg_ingest/schema.graphql | 6 ++ pkg/engines/arrow_ingest.go | 2 +- pkg/engines/engines.go | 12 +-- pkg/planner/node_arrow_ingest.go | 14 +-- pkg/planner/node_arrow_ingest_test.go | 20 ++-- 7 files changed, 140 insertions(+), 24 deletions(-) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index 0f51cd84..cc0054d0 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -559,6 +559,105 @@ func TestIngest_Postgres_UsesBinaryCopyWithoutTextOnlyTypes(t *testing.T) { }, 5*time.Second, 100*time.Millisecond, "postgres log did not contain binary COPY for binary_events") } +// TestIngest_Postgres_GeometryEdgeCases verifies that the native +// DuckDB GEOMETRY -> PostGIS bridge faithfully carries geometries that the +// existing suite never exercised: SQL NULL, 3D (Z) coordinates, EMPTY +// geometries and a mixed GEOMETRYCOLLECTION. The target column is a bare +// `geometry` (no typmod) so PostGIS accepts any type/dimension and the +// assertions reflect exactly what crossed the bridge — not what a typmod +// coerced. Geometry is sent as geoarrow.wkt so DuckDB staging normalises it to +// a canonical GEOMETRY via ST_GeomFromText before the bridge writes it out. +func TestIngest_Postgres_GeometryEdgeCases(t *testing.T) { + env := setupEnv(t) + + _, err := env.pgConn.ExecContext(context.Background(), + "TRUNCATE TABLE geom_edge RESTART IDENTITY") + require.NoError(t, err) + + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + { + Name: "geom", + Type: arrow.BinaryTypes.String, + Nullable: true, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"}), + }, + }, nil) + + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + names := b.Field(0).(*array.StringBuilder) + geoms := b.Field(1).(*array.StringBuilder) + + names.Append("a_null") + geoms.AppendNull() + names.Append("b_point_z") + geoms.Append("POINT Z (1 2 3)") + names.Append("c_empty_point") + geoms.Append("POINT EMPTY") + names.Append("d_geomcollection") + geoms.Append("GEOMETRYCOLLECTION(POINT(1 2),LINESTRING(0 0,1 1))") + + rec := b.NewRecord() + b.Release() + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.geom_edge", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(4), res.Inserted) + + type edgeRow struct { + isNull bool + gtype string + zmflag int + isEmpty bool + numGeom int + } + rows, err := env.pgConn.Query(` + SELECT name, + geom IS NULL, + COALESCE(GeometryType(geom), ''), + COALESCE(ST_Zmflag(geom), -1), + COALESCE(ST_IsEmpty(geom), false), + COALESCE(ST_NumGeometries(geom), 0) + FROM geom_edge ORDER BY name`) + require.NoError(t, err) + defer rows.Close() + + got := map[string]edgeRow{} + for rows.Next() { + var name string + var r edgeRow + require.NoError(t, rows.Scan(&name, &r.isNull, &r.gtype, &r.zmflag, &r.isEmpty, &r.numGeom)) + got[name] = r + } + require.NoError(t, rows.Err()) + require.Len(t, got, 4) + + // NULL geometry must round-trip as SQL NULL. + assert.True(t, got["a_null"].isNull, "NULL geometry must stay NULL through the native bridge") + + // 3D point: the Z dimension must survive DuckDB GEOMETRY -> PostGIS. + assert.False(t, got["b_point_z"].isNull) + assert.Equal(t, "POINT", got["b_point_z"].gtype) + assert.Equal(t, 2, got["b_point_z"].zmflag, "ST_Zmflag 2 == XYZ (Z present, no M)") + + // EMPTY geometry must remain an empty geometry of the right type. + assert.Equal(t, "POINT", got["c_empty_point"].gtype) + assert.True(t, got["c_empty_point"].isEmpty, "POINT EMPTY must survive as empty") + + // Mixed GeometryCollection must keep its member count. + assert.Equal(t, "GEOMETRYCOLLECTION", got["d_geomcollection"].gtype) + assert.Equal(t, 2, got["d_geomcollection"].numGeom) + + // Exact coordinates for the 3D point. + var x, y, z float64 + require.NoError(t, env.pgConn.QueryRow( + "SELECT ST_X(geom), ST_Y(geom), ST_Z(geom) FROM geom_edge WHERE name = 'b_point_z'", + ).Scan(&x, &y, &z)) + assert.Equal(t, [3]float64{1, 2, 3}, [3]float64{x, y, z}) +} + func TestIngest_Postgres_PermissionData(t *testing.T) { env := setupEnv(t) diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index 46ae1632..dfd884a9 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -51,3 +51,14 @@ CREATE TABLE binary_events ( value DOUBLE PRECISION NOT NULL, geom GEOMETRY(Point, 0) ); + +-- Permissive geometry table for ingest edge-case coverage: NULL, 3D (Z), +-- EMPTY and GEOMETRYCOLLECTION values. The column is a bare `geometry` +-- (no type/SRID typmod) so it accepts whatever the native +-- DuckDB GEOMETRY -> PostGIS bridge produces, letting the test assert +-- whether the bridge preserves these non-trivial geometries faithfully. +CREATE TABLE geom_edge ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + geom GEOMETRY +); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 7656b17c..3110c874 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -41,3 +41,9 @@ type binary_events @table(name: "binary_events") { value: Float! geom: Geometry @geometry_info(srid: 0, type: POINT) } + +type geom_edge @table(name: "geom_edge") { + id: BigInt! @pk @default(sequence: "geom_edge_id_seq") + name: String! + geom: Geometry @geometry_info(srid: 0) +} diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index b7ce84e4..135825af 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -13,7 +13,7 @@ import ( // ArrowIngestStagingBuilder owns every SQL expression evaluated by DuckDB while // an Arrow reader is registered as a view. Target-specific conversion, when a -// target needs one, is applied separately through EngineIngestTargetCaster. +// target needs one, is applied separately through EngineIngestValueAdapter. type ArrowIngestStagingBuilder struct { duckdb DuckDB } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index e09f41d0..aa47f72a 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -72,15 +72,15 @@ type EngineTypeCaster interface { CastFromIntermediateType(field *ast.Field, toJSON bool) (string, error) } -// EngineIngestTargetCaster is implemented by engines whose ingest target -// cannot consume canonical DuckDB staging values directly. Engines that do not +// EngineIngestValueAdapter is implemented by engines whose ingest path cannot +// consume canonical DuckDB staging values directly. Engines that do not // implement it explicitly accept canonical staging values as their ingest // contract. -type EngineIngestTargetCaster interface { +type EngineIngestValueAdapter interface { Engine - // CastIngestValueToTarget converts a DuckDB staging SELECT expression into - // the representation accepted by the target source during ingest. - CastIngestValueToTarget(field *ast.Field, stagingExpr string) (string, error) + // AdaptIngestValueSQL adapts a DuckDB staging value SQL fragment to the + // representation expected by this engine/source during batch ingest. + AdaptIngestValueSQL(field *ast.Field, valueSQL string) (string, error) } type EngineVectorDistanceCalculator interface { diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go index 3fe782d3..ec381c6b 100644 --- a/pkg/planner/node_arrow_ingest.go +++ b/pkg/planner/node_arrow_ingest.go @@ -295,7 +295,7 @@ func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info // - info is the GraphQL data object plus its DB table/column mapping. // - mutation is the GraphQL insert mutation used for insert defaults. // - engine describes the target and optionally adapts canonical DuckDB -// staging values through EngineIngestTargetCaster. +// staging values through EngineIngestValueAdapter. // - columns are Arrow columns already resolved to GraphQL fields and DB // columns by resolveIngestColumns. // - permissionData contains extra GraphQL input values injected by the @@ -364,7 +364,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e targetFields = append(targetFields, c.Field.FieldSourceName("", true)) // selectExprs are evaluated from the DuckDB Arrow view and must // stay in the same order as targetFields. - expr, err := castIngestValueToTarget(engine, ingestASTField(info, c.Field, c.FieldDef), fieldValues[c.Field.Name]) + expr, err := adaptIngestValueSQL(engine, ingestASTField(info, c.Field, c.FieldDef), fieldValues[c.Field.Name]) if err != nil { return "", nil, err } @@ -385,7 +385,7 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e if fieldDef == nil { return "", nil, fmt.Errorf("ingest field %q definition not found in data object %q", fieldInfo.Name, info.Name) } - expr, err := castIngestValueToTarget(engine, ingestASTField(info, fieldInfo, fieldDef), expr) + expr, err := adaptIngestValueSQL(engine, ingestASTField(info, fieldInfo, fieldDef), expr) if err != nil { return "", nil, err } @@ -409,12 +409,12 @@ func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, e } } -func castIngestValueToTarget(engine engines.Engine, field *ast.Field, stagingExpr string) (string, error) { - targetCaster, ok := engine.(engines.EngineIngestTargetCaster) +func adaptIngestValueSQL(engine engines.Engine, field *ast.Field, valueSQL string) (string, error) { + adapter, ok := engine.(engines.EngineIngestValueAdapter) if !ok { - return stagingExpr, nil + return valueSQL, nil } - return targetCaster.CastIngestValueToTarget(field, stagingExpr) + return adapter.AdaptIngestValueSQL(field, valueSQL) } func ingestASTField(info *sdl.Object, fieldInfo *sdl.Field, fieldDef *ast.FieldDefinition) *ast.Field { diff --git a/pkg/planner/node_arrow_ingest_test.go b/pkg/planner/node_arrow_ingest_test.go index 87d8cdcf..315aa924 100644 --- a/pkg/planner/node_arrow_ingest_test.go +++ b/pkg/planner/node_arrow_ingest_test.go @@ -7,17 +7,17 @@ import ( "github.com/vektah/gqlparser/v2/ast" ) -type testIngestTargetCaster struct { +type testIngestValueAdapter struct { *engines.DuckDB } -func (e *testIngestTargetCaster) CastIngestValueToTarget(_ *ast.Field, stagingExpr string) (string, error) { - return "target_cast(" + stagingExpr + ")", nil +func (e *testIngestValueAdapter) AdaptIngestValueSQL(_ *ast.Field, valueSQL string) (string, error) { + return "adapted(" + valueSQL + ")", nil } -func TestCastIngestValueToTarget(t *testing.T) { +func TestAdaptIngestValueSQL(t *testing.T) { t.Run("direct target", func(t *testing.T) { - got, err := castIngestValueToTarget(engines.NewDuckDB(), nil, "staging_value") + got, err := adaptIngestValueSQL(engines.NewDuckDB(), nil, "staging_value") if err != nil { t.Fatal(err) } @@ -26,14 +26,14 @@ func TestCastIngestValueToTarget(t *testing.T) { } }) - t.Run("target caster", func(t *testing.T) { - engine := &testIngestTargetCaster{DuckDB: engines.NewDuckDB()} - got, err := castIngestValueToTarget(engine, nil, "staging_value") + t.Run("value adapter", func(t *testing.T) { + engine := &testIngestValueAdapter{DuckDB: engines.NewDuckDB()} + got, err := adaptIngestValueSQL(engine, nil, "staging_value") if err != nil { t.Fatal(err) } - if got != "target_cast(staging_value)" { - t.Fatalf("got %q, want target cast expression", got) + if got != "adapted(staging_value)" { + t.Fatalf("got %q, want adapted ingest expression", got) } }) } From 6a5c0e19e7c5dbaf2321c3cd1b79eaa743b74377 Mon Sep 17 00:00:00 2001 From: vadim Date: Sun, 28 Jun 2026 17:27:05 +0400 Subject: [PATCH 36/36] ipc ingest --- .../ingest-duckdb/ingest_duckdb_test.go | 900 ++++++++++----- .../schemas/duck_ingest/schema.graphql | 4 + .../ingest-postgres/ingest_postgres_test.go | 1008 +++++++++++------ .../ingest-postgres/testdata/init.sql | 4 + .../testdata/schemas/pg_ingest/schema.graphql | 4 + pkg/db/arrow_ingest_source.go | 6 +- pkg/engines/arrow_ingest.go | 344 ++++-- pkg/engines/arrow_ingest_test.go | 67 +- 8 files changed, 1619 insertions(+), 718 deletions(-) diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go index ab33158c..105a04b4 100644 --- a/integration-test/ingest-duckdb/ingest_duckdb_test.go +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -6,6 +6,7 @@ import ( "bytes" "context" "database/sql" + "encoding/hex" "encoding/json" "errors" "fmt" @@ -149,14 +150,18 @@ func setupEnv(t *testing.T) *ingestEnv { payload_map JSON, payload_scalar JSON, payload_arrow_json JSON, + payload_geo_point JSON, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), geom GEOMETRY, geom_wkt GEOMETRY, - geom_geojson GEOMETRY, - geom_hugr_geojson GEOMETRY, - geom_plain_geojson GEOMETRY, - geom_wkb GEOMETRY, - geom_line GEOMETRY, + geom_geojson GEOMETRY, + geom_hugr_geojson GEOMETRY, + geom_plain_geojson GEOMETRY, + geom_geojson_struct GEOMETRY, + geom_geojson_arrow_json GEOMETRY, + geom_wkb GEOMETRY, + geom_hexwkb GEOMETRY, + geom_line GEOMETRY, geom_polygon_native GEOMETRY, geom_multipoint GEOMETRY, geom_multiline GEOMETRY, @@ -277,10 +282,10 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) - b.Field(1).(*array.Float64Builder).AppendValues(values, nil) - b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) - pBuilder := b.Field(3).(*array.StringBuilder) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues(names, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues(values, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := recordFieldBuilder(t, b, "payload").(*array.StringBuilder) for _, p := range payload { if p == "" { pBuilder.AppendNull() @@ -288,9 +293,9 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b pBuilder.Append(p) } } - tsBuilder := b.Field(4).(*array.TimestampBuilder) + tsBuilder := recordFieldBuilder(t, b, "created_at").(*array.TimestampBuilder) tsBuilder.AppendValues(created, nil) - return b.NewRecord() + return b.NewRecordBatch() } func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { @@ -309,128 +314,239 @@ func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { }, nil) b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) defer b.Release() - b.Field(0).(*array.StringBuilder).Append("malformed-json") - b.Field(1).(*array.Float64Builder).Append(1) - b.Field(2).(*array.BooleanBuilder).Append(true) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("malformed-json") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + payloadBuilder := recordFieldBuilder(t, b, payloadName) if binary { - b.Field(3).(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + payloadBuilder.(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) } else { - b.Field(3).(*array.StringBuilder).Append(`{"unterminated":`) + payloadBuilder.(*array.StringBuilder).Append(`{"unterminated":`) } - return b.NewRecord() + return b.NewRecordBatch() } -var jsonPhysicalTypeColumns = []string{ - "payload", - "payload_large_string", - "payload_string_view", - "payload_binary", - "payload_large_binary", - "payload_binary_view", - "payload_struct", - "payload_list", - "payload_large_list", - "payload_fixed_size_list", - "payload_list_view", - "payload_large_list_view", - "payload_map", - "payload_scalar", - "payload_arrow_json", +func recordFieldBuilder(t *testing.T, b *array.RecordBuilder, name string) array.Builder { + t.Helper() + indices := b.Schema().FieldIndices(name) + require.Len(t, indices, 1, "arrow field %q must exist exactly once", name) + return b.Field(indices[0]) } -func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { +func mustRecordFieldBuilder(b *array.RecordBuilder, name string) array.Builder { + indices := b.Schema().FieldIndices(name) + if len(indices) != 1 { + panic(fmt.Sprintf("arrow field %q must exist exactly once", name)) + } + return b.Field(indices[0]) +} + +type eventsRecordBuilders struct { + names *array.StringBuilder + values *array.Float64Builder + active *array.BooleanBuilder + payloads *array.StringBuilder + createdAt *array.TimestampBuilder +} + +func eventsRecordBuildersFor(b *array.RecordBuilder) eventsRecordBuilders { + return eventsRecordBuilders{ + names: mustRecordFieldBuilder(b, "name").(*array.StringBuilder), + values: mustRecordFieldBuilder(b, "value").(*array.Float64Builder), + active: mustRecordFieldBuilder(b, "is_active").(*array.BooleanBuilder), + payloads: mustRecordFieldBuilder(b, "payload").(*array.StringBuilder), + createdAt: mustRecordFieldBuilder(b, "created_at").(*array.TimestampBuilder), + } +} + +type jsonPhysicalTypeSpec struct { + name string + dataType arrow.DataType + arrowExtension string + expected any + appendValue func(*testing.T, array.Builder) +} + +const ( + jsonStructKindField = iota + jsonStructCountField +) + +func jsonPhysicalTypeSpecs(t *testing.T) []jsonPhysicalTypeSpec { t.Helper() - pool := memory.NewGoAllocator() structType := arrow.StructOf( arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, ) + geoPointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) require.NoError(t, err) - schema := arrow.NewSchema([]arrow.Field{ + + return []jsonPhysicalTypeSpec{ + {name: "payload", dataType: arrow.BinaryTypes.String, expected: map[string]any{"kind": "string"}, appendValue: appendJSONText(`{"kind":"string"}`)}, + {name: "payload_large_string", dataType: arrow.BinaryTypes.LargeString, expected: map[string]any{"kind": "large_string"}, appendValue: appendJSONText(`{"kind":"large_string"}`)}, + {name: "payload_string_view", dataType: arrow.BinaryTypes.StringView, expected: map[string]any{"kind": "string_view"}, appendValue: appendJSONText(`{"kind":"string_view"}`)}, + {name: "payload_binary", dataType: arrow.BinaryTypes.Binary, expected: map[string]any{"kind": "binary"}, appendValue: appendJSONText(`{"kind":"binary"}`)}, + {name: "payload_large_binary", dataType: arrow.BinaryTypes.LargeBinary, expected: map[string]any{"kind": "large_binary"}, appendValue: appendJSONText(`{"kind":"large_binary"}`)}, + {name: "payload_binary_view", dataType: arrow.BinaryTypes.BinaryView, expected: map[string]any{"kind": "binary_view"}, appendValue: appendJSONText(`{"kind":"binary_view"}`)}, + {name: "payload_struct", dataType: structType, expected: map[string]any{"kind": "struct", "count": float64(14)}, appendValue: appendJSONStruct("struct", 14)}, + {name: "payload_list", dataType: arrow.ListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(1), float64(2)}, appendValue: appendInt64JSONList(1, 2)}, + {name: "payload_large_list", dataType: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(3), float64(4)}, appendValue: appendInt64JSONList(3, 4)}, + {name: "payload_fixed_size_list", dataType: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), expected: []any{float64(5), float64(6)}, appendValue: appendInt64JSONList(5, 6)}, + {name: "payload_list_view", dataType: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(7), float64(8)}, appendValue: appendInt64JSONList(7, 8)}, + {name: "payload_large_list_view", dataType: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(9), float64(10)}, appendValue: appendInt64JSONList(9, 10)}, + {name: "payload_map", dataType: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), expected: map[string]any{"a": float64(11), "b": float64(12)}, appendValue: appendInt64JSONMap([]string{"a", "b"}, []int64{11, 12})}, + {name: "payload_scalar", dataType: arrow.PrimitiveTypes.Int64, expected: "13", appendValue: appendInt64JSONScalar(13)}, + {name: "payload_arrow_json", dataType: arrowJSONType, expected: map[string]any{"kind": "arrow_json"}, appendValue: appendArrowJSONText(`{"kind":"arrow_json"}`)}, + {name: "payload_geo_point", dataType: geoPointType, arrowExtension: "geoarrow.point", expected: geoJSONGeometry("Point", pointCoordinate(xyPoint{x: 30.5, y: 50.25})), appendValue: appendGeoArrowJSONPoint(xyPoint{x: 30.5, y: 50.25})}, + } +} + +func jsonPhysicalTypeColumns(t *testing.T) []string { + t.Helper() + specs := jsonPhysicalTypeSpecs(t) + columns := make([]string, 0, len(specs)) + for _, spec := range specs { + columns = append(columns, spec.name) + } + return columns +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + specs := jsonPhysicalTypeSpecs(t) + fields := []arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, - {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: false}, - {Name: "payload_large_string", Type: arrow.BinaryTypes.LargeString, Nullable: false}, - {Name: "payload_string_view", Type: arrow.BinaryTypes.StringView, Nullable: false}, - {Name: "payload_binary", Type: arrow.BinaryTypes.Binary, Nullable: false}, - {Name: "payload_large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: false}, - {Name: "payload_binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: false}, - {Name: "payload_struct", Type: structType, Nullable: false}, - {Name: "payload_list", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_large_list", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_fixed_size_list", Type: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_list_view", Type: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, - {Name: "payload_arrow_json", Type: arrowJSONType, Nullable: false}, - }, nil) + } + for _, spec := range specs { + field := arrow.Field{Name: spec.name, Type: spec.dataType, Nullable: false} + if spec.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": spec.arrowExtension}) + } + fields = append(fields, field) + } + schema := arrow.NewSchema(fields, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).Append("json-physical-types") - b.Field(1).(*array.Float64Builder).Append(1) - b.Field(2).(*array.BooleanBuilder).Append(true) - b.Field(3).(*array.StringBuilder).Append(`{"kind":"string"}`) - b.Field(4).(*array.LargeStringBuilder).Append(`{"kind":"large_string"}`) - b.Field(5).(*array.StringViewBuilder).Append(`{"kind":"string_view"}`) - b.Field(6).(*array.BinaryBuilder).Append([]byte(`{"kind":"binary"}`)) - b.Field(7).(*array.BinaryBuilder).Append([]byte(`{"kind":"large_binary"}`)) - b.Field(8).(*array.BinaryViewBuilder).Append([]byte(`{"kind":"binary_view"}`)) - - structBuilder := b.Field(9).(*array.StructBuilder) - structBuilder.Append(true) - structBuilder.FieldBuilder(0).(*array.StringBuilder).Append("struct") - structBuilder.FieldBuilder(1).(*array.Int64Builder).Append(14) - - listBuilder := b.Field(10).(*array.ListBuilder) - listBuilder.Append(true) - listBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{1, 2}, nil) - largeListBuilder := b.Field(11).(*array.LargeListBuilder) - largeListBuilder.Append(true) - largeListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{3, 4}, nil) - fixedListBuilder := b.Field(12).(*array.FixedSizeListBuilder) - fixedListBuilder.Append(true) - fixedListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{5, 6}, nil) - listViewBuilder := b.Field(13).(*array.ListViewBuilder) - listViewBuilder.AppendWithSize(true, 2) - listViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{7, 8}, nil) - largeListViewBuilder := b.Field(14).(*array.LargeListViewBuilder) - largeListViewBuilder.AppendWithSize(true, 2) - largeListViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{9, 10}, nil) - mapBuilder := b.Field(15).(*array.MapBuilder) - mapBuilder.Append(true) - mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) - mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) - b.Field(16).(*array.Int64Builder).Append(13) - arrowJSONBuilder := b.Field(17).(*array.ExtensionBuilder) - arrowJSONBuilder.StorageBuilder().(*array.StringBuilder).Append(`{"kind":"arrow_json"}`) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("json-physical-types") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + for _, spec := range specs { + spec.appendValue(t, recordFieldBuilder(t, b, spec.name)) + } return b.NewRecordBatch() } -func jsonPhysicalTypesExpected() map[string]any { - return map[string]any{ - "name": "json-physical-types", - "payload": map[string]any{"kind": "string"}, - "payload_large_string": map[string]any{"kind": "large_string"}, - "payload_string_view": map[string]any{"kind": "string_view"}, - "payload_binary": map[string]any{"kind": "binary"}, - "payload_large_binary": map[string]any{"kind": "large_binary"}, - "payload_binary_view": map[string]any{"kind": "binary_view"}, - "payload_struct": map[string]any{"kind": "struct", "count": float64(14)}, - "payload_list": []any{float64(1), float64(2)}, - "payload_large_list": []any{float64(3), float64(4)}, - "payload_fixed_size_list": []any{float64(5), float64(6)}, - "payload_list_view": []any{float64(7), float64(8)}, - "payload_large_list_view": []any{float64(9), float64(10)}, - "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, - "payload_scalar": "13", - "payload_arrow_json": map[string]any{"kind": "arrow_json"}, +func appendJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.StringBuilder: + b.Append(value) + case *array.LargeStringBuilder: + b.Append(value) + case *array.StringViewBuilder: + b.Append(value) + case *array.BinaryBuilder: + b.Append([]byte(value)) + case *array.BinaryViewBuilder: + b.Append([]byte(value)) + default: + require.Failf(t, "unsupported JSON text builder", "got %T", builder) + } + } +} + +func appendJSONStruct(kind string, count int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + structBuilder.Append(true) + structBuilder.FieldBuilder(jsonStructKindField).(*array.StringBuilder).Append(kind) + structBuilder.FieldBuilder(jsonStructCountField).(*array.Int64Builder).Append(count) + } +} + +func appendInt64JSONList(values ...int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.ListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.FixedSizeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.ListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + default: + require.Failf(t, "unsupported JSON list builder", "got %T", builder) + } + } +} + +func appendInt64JSONMap(keys []string, values []int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + mapBuilder, ok := builder.(*array.MapBuilder) + require.Truef(t, ok, "got %T, want *array.MapBuilder", builder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues(keys, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues(values, nil) } } +func appendInt64JSONScalar(value int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + intBuilder, ok := builder.(*array.Int64Builder) + require.Truef(t, ok, "got %T, want *array.Int64Builder", builder) + intBuilder.Append(value) + } +} + +func appendArrowJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + extensionBuilder, ok := builder.(*array.ExtensionBuilder) + require.Truef(t, ok, "got %T, want *array.ExtensionBuilder", builder) + extensionBuilder.StorageBuilder().(*array.StringBuilder).Append(value) + } +} + +func appendGeoArrowJSONPoint(point xyPoint) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + appendPoint(structBuilder, point) + } +} + +func jsonPhysicalTypesExpected(t *testing.T) map[string]any { + t.Helper() + expected := map[string]any{"name": "json-physical-types"} + for _, spec := range jsonPhysicalTypeSpecs(t) { + expected[spec.name] = spec.expected + } + return expected +} + func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { t.Helper() query := fmt.Sprintf(`{ @@ -440,7 +556,7 @@ func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, %s } } - }`, dsName, strings.Join(jsonPhysicalTypeColumns, "\n")) + }`, dsName, strings.Join(jsonPhysicalTypeColumns(t), "\n")) res, err := service.Query(context.Background(), query, nil) require.NoError(t, err) defer res.Close() @@ -454,7 +570,7 @@ func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, root := data[dsName].(map[string]any) rows := root["events"].([]any) require.Len(t, rows, 1, "response: %s", string(body)) - assert.Equal(t, jsonPhysicalTypesExpected(), rows[0]) + assert.Equal(t, jsonPhysicalTypesExpected(t), rows[0]) } // --- Core tests ----------------------------------------------------------- @@ -524,7 +640,7 @@ func TestIngest_DuckDB_JSONPhysicalTypes(t *testing.T) { res, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) require.NoError(t, err) assert.Equal(t, int64(1), res.Inserted) - expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns...) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns(t)...) assert.ElementsMatch(t, expectedColumns, res.Columns) assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) } @@ -665,9 +781,9 @@ func TestIngest_DuckDB_UnknownColumn(t *testing.T) { }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues([]string{"x"}, nil) - b.Field(1).(*array.Int32Builder).AppendValues([]int32{1}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"x"}, nil) + recordFieldBuilder(t, b, "not_a_column").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() defer rec.Release() _, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) @@ -690,8 +806,8 @@ func TestIngest_DuckDB_UnknownDataObject(t *testing.T) { }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() defer rec.Release() _, err := env.client.IngestRecord(context.Background(), env.dsName+".does_not_exist", rec) @@ -712,24 +828,25 @@ func TestIngest_DuckDB_MultipleBatches(t *testing.T) { mk := func(names []string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + fields := eventsRecordBuildersFor(b) + fields.names.AppendValues(names, nil) vals := make([]float64, len(names)) for i := range vals { vals[i] = float64(i) } - b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) + fields.values.AppendValues(vals, nil) active := make([]bool, len(names)) for i := range active { active[i] = true } - b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) - b.Field(3).(*array.StringBuilder).AppendNulls(len(names)) + fields.active.AppendValues(active, nil) + fields.payloads.AppendNulls(len(names)) ts := make([]arrow.Timestamp, len(names)) for i := range ts { ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) } - b.Field(4).(*array.TimestampBuilder).AppendValues(ts, nil) - return b.NewRecord() + fields.createdAt.AppendValues(ts, nil) + return b.NewRecordBatch() } rec1 := mk([]string{"a", "b"}) defer rec1.Release() @@ -849,10 +966,10 @@ func TestIngest_DuckDB_Stream(t *testing.T) { {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, }, nil) b := array.NewRecordBuilder(pool, schema) - b.Field(0).(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) - b.Field(1).(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) - b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecordBatch() b.Release() defer rec.Release() @@ -913,6 +1030,7 @@ func TestIngest_DuckDB_ArrowIPCFile_StreamFormat(t *testing.T) { var count int require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) assert.Equal(t, totalRows, count) + assertArrowIPCFileGeometry(t, env, ro, namePrefix, totalRows) t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)", totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) @@ -948,11 +1066,29 @@ func TestIngest_DuckDB_ArrowIPCFile_FileFormat(t *testing.T) { var count int require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) assert.Equal(t, totalRows, count) + assertArrowIPCFileGeometry(t, env, ro, namePrefix, totalRows) t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)", totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } +func assertArrowIPCFileGeometry(t *testing.T, env *ingestEnv, ro *sql.DB, namePrefix string, totalRows int) { + t.Helper() + _, err := ro.Exec("LOAD spatial") + require.NoError(t, err) + + lastName, lastPoint := geometryBatchRow(namePrefix, totalRows-1) + values := scanGeometryValues(t, ro.QueryRow(fmt.Sprintf(` + SELECT %s + FROM events + WHERE name = ? + `, geometrySelectList()), lastName)) + assert.Equal(t, geometryExpected(pointWKT(lastPoint), coord(lastPoint.x), coord(lastPoint.y)), values) + assertGeometryReadThroughHugr(t, env.service, env.dsName, fmt.Sprintf(`filter: { name: { eq: "%s" } }`, lastName), []map[string]any{ + geometryReadExpected(lastName, lastPoint, lastPoint.x, lastPoint.y), + }) +} + func TestIngest_DuckDB_ArrowIPCFile_NotFound(t *testing.T) { env := setupEnv(t) _, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, @@ -1015,8 +1151,8 @@ func TestIngest_LazyReader_Termination_DuckDB(t *testing.T) { mk := func(v int32) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.Int32Builder).Append(v) - return b.NewRecord() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).Append(v) + return b.NewRecordBatch() } // gen returns batches then nil → clean end-of-stream. @@ -1105,10 +1241,10 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, }, nil) bld := array.NewRecordBuilder(pool, schema) - bld.Field(0).(*array.StringBuilder).AppendValues([]string{"direct"}, nil) - bld.Field(1).(*array.Float64Builder).AppendValues([]float64{42}, nil) - bld.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true}, nil) - rec := bld.NewRecord() + recordFieldBuilder(t, bld, "name").(*array.StringBuilder).AppendValues([]string{"direct"}, nil) + recordFieldBuilder(t, bld, "value").(*array.Float64Builder).AppendValues([]float64{42}, nil) + recordFieldBuilder(t, bld, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true}, nil) + rec := bld.NewRecordBatch() bld.Release() defer rec.Release() @@ -1187,7 +1323,10 @@ func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeometryTypesRecord(t, []string{"geo-a", "geo-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + {name: "geo-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) defer rec.Release() var buf bytes.Buffer @@ -1212,32 +1351,19 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { _, err = ro.Exec("LOAD spatial") require.NoError(t, err) - rows, err := ro.Query(` + rows, err := ro.Query(fmt.Sprintf(` SELECT name, - ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), - ST_AsText(geom_hugr_geojson), ST_AsText(geom_plain_geojson), - ST_AsText(geom_wkb), - ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), - ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) + %s FROM events - WHERE name LIKE 'geo-%' + WHERE name LIKE 'geo-%%' ORDER BY name - `) + `, geometrySelectList())) require.NoError(t, err) defer rows.Close() got := map[string][]string{} for rows.Next() { - var name string - values := make([]string, 11) - scanArgs := []any{&name} - for i := range values { - scanArgs = append(scanArgs, &values[i]) - } - require.NoError(t, rows.Scan(scanArgs...)) - for i := range values { - values[i] = compactWKT(values[i]) - } + name, values := scanNamedGeometryValues(t, rows) got[name] = values } require.NoError(t, rows.Err()) @@ -1250,7 +1376,10 @@ func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { func TestIngest_HTTP_GeometryTypes_ReadThroughHugr_DuckDB(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeometryTypesRecord(t, []string{"geo-read-a", "geo-read-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-read-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + {name: "geo-read-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) defer rec.Release() var buf bytes.Buffer @@ -1266,8 +1395,8 @@ func TestIngest_HTTP_GeometryTypes_ReadThroughHugr_DuckDB(t *testing.T) { require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{ - geometryReadExpected("geo-read-a", [2]float64{30.5, 50.25}, 0, 0), - geometryReadExpected("geo-read-b", [2]float64{-73.935242, 40.730610}, 1, 1), + geometryReadExpected("geo-read-a", xyPoint{x: 30.5, y: 50.25}, 0, 0), + geometryReadExpected("geo-read-b", xyPoint{x: -73.935242, y: 40.730610}, 1, 1), }) } @@ -1290,7 +1419,7 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { w := ipc.NewWriter(pw, ipc.WithSchema(schema)) var streamErr error for batchIdx := 0; batchIdx < numBatches; batchIdx++ { - rec := buildGeometryTypesBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix) + rec := buildGeometryTypesBatch(t, pool, schema, batchIdx, rowsPerBatch, namePrefix) if err := w.Write(rec); err != nil { streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err) rec.Release() @@ -1328,22 +1457,14 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - values := make([]string, 11) - require.NoError(t, ro.QueryRow(` - SELECT ST_AsText(geom), ST_AsText(geom_wkt), ST_AsText(geom_geojson), - ST_AsText(geom_hugr_geojson), ST_AsText(geom_plain_geojson), - ST_AsText(geom_wkb), - ST_AsText(geom_line), ST_AsText(geom_polygon_native), ST_AsText(geom_multipoint), - ST_AsText(geom_multiline), ST_AsText(geom_multipolygon) + values := scanGeometryValues(t, ro.QueryRow(fmt.Sprintf(` + SELECT %s FROM events WHERE name = 'dk-geo-bulk-049999' - `).Scan(&values[0], &values[1], &values[2], &values[3], &values[4], &values[5], &values[6], &values[7], &values[8], &values[9], &values[10])) - for i := range values { - values[i] = compactWKT(values[i]) - } + `, geometrySelectList()))) assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "dk-geo-bulk-049999" } }`, []map[string]any{ - geometryReadExpected("dk-geo-bulk-049999", [2]float64{99, 49}, 99, 49), + geometryReadExpected("dk-geo-bulk-049999", xyPoint{x: 99, y: 49}, 99, 49), }) elapsed := time.Since(start) @@ -1370,71 +1491,190 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } -func makeGeometryTypesRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { +func eventsArrowFileSchema() *arrow.Schema { + fields := append([]arrow.Field{}, eventsArrowSchema().Fields()...) + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +type geometryTypesRow struct { + name string + value float64 + active bool + point xyPoint + shapeOrigin xyPoint +} + +func makeGeometryTypesRecord(t *testing.T, rows []geometryTypesRow) (arrow.RecordBatch, *arrow.Schema) { t.Helper() - require.Len(t, points, len(names)) schema := geometryTypesSchema() pool := memory.NewGoAllocator() b := array.NewRecordBuilder(pool, schema) defer b.Release() - for i, name := range names { - appendGeometryTypesRow(b, name, float64(i+1), true, points[i], float64(i), float64(i)) + for _, row := range rows { + appendGeometryTypesRow(t, b, row) } return b.NewRecordBatch(), schema } func geometryTypesSchema() *arrow.Schema { + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +func geometryArrowFields() []arrow.Field { pointType := arrow.StructOf( arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) lineType := arrow.ListOf(pointType) polygonType := arrow.ListOf(lineType) - return arrow.NewSchema([]arrow.Field{ - {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, - {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, - {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, - {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, - {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, - {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, - {Name: "geom_hugr_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.geojson"})}, - {Name: "geom_plain_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geojson"})}, - {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, - {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, - {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, - {Name: "geom_multipoint", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipoint"})}, - {Name: "geom_multiline", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multilinestring"})}, - {Name: "geom_multipolygon", Type: arrow.ListOf(polygonType), Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipolygon"})}, - }, nil) + fields := make([]arrow.Field, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + field := arrow.Field{ + Name: col.name, + Type: col.arrowType, + Nullable: false, + } + if col.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": col.arrowExtension}) + } + fields = append(fields, field) + } + return fields +} + +type geometryValueColumn struct { + name string + arrowType arrow.DataType + arrowExtension string + expectedWKT func(point, x, y string) string +} + +func geometryValueColumns(pointType, lineType, polygonType arrow.DataType) []geometryValueColumn { + geoJSONStructType := arrow.StructOf( + arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "coordinates", Type: arrow.ListOf(arrow.ListOf(arrow.ListOf(arrow.PrimitiveTypes.Float64))), Nullable: false}, + ) + line := func(_ string, x string, y string) string { + return fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) + } + polygon := func(_ string, x string, y string) string { return polygonWKT(x, y) } + point := func(point string, _ string, _ string) string { return point } + multiPoint := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y) + } + multiLine := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)) + } + multiPolygon := func(_ string, x string, y string) string { return multiPolygonWKT(x, y) } + + return []geometryValueColumn{ + {name: "geom", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point}, + {name: "geom_wkt", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line}, + {name: "geom_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.geojson", expectedWKT: polygon}, + {name: "geom_hugr_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.geojson", expectedWKT: polygon}, + {name: "geom_plain_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geojson", expectedWKT: polygon}, + {name: "geom_geojson_struct", arrowType: geoJSONStructType, expectedWKT: polygon}, + {name: "geom_geojson_arrow_json", arrowType: mustArrowJSONType(), arrowExtension: "arrow.json", expectedWKT: polygon}, + {name: "geom_wkb", arrowType: arrow.BinaryTypes.Binary, arrowExtension: "geoarrow.wkb", expectedWKT: point}, + {name: "geom_hexwkb", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.hexwkb", expectedWKT: point}, + {name: "geom_line", arrowType: lineType, arrowExtension: "geoarrow.linestring", expectedWKT: line}, + {name: "geom_polygon_native", arrowType: polygonType, arrowExtension: "geoarrow.polygon", expectedWKT: polygon}, + {name: "geom_multipoint", arrowType: lineType, arrowExtension: "geoarrow.multipoint", expectedWKT: multiPoint}, + {name: "geom_multiline", arrowType: polygonType, arrowExtension: "geoarrow.multilinestring", expectedWKT: multiLine}, + {name: "geom_multipolygon", arrowType: arrow.ListOf(polygonType), arrowExtension: "geoarrow.multipolygon", expectedWKT: multiPolygon}, + } +} + +func mustArrowJSONType() arrow.DataType { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + panic(err) + } + return typ } func geometryTypesColumns() []string { - return []string{ - "name", "value", "is_active", - "geom", "geom_wkt", "geom_geojson", - "geom_hugr_geojson", "geom_plain_geojson", "geom_wkb", - "geom_line", "geom_polygon_native", "geom_multipoint", - "geom_multiline", "geom_multipolygon", + pointType, lineType, polygonType := geometryArrowTypes() + columns := []string{"name", "value", "is_active"} + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + columns = append(columns, col.name) } + return columns } func geometryExpected(point, x, y string) []string { - return []string{ - point, - fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - polygonWKT(x, y), - polygonWKT(x, y), - polygonWKT(x, y), - point, - fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)), - polygonWKT(x, y), - fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), - fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), - multiPolygonWKT(x, y), + pointType, lineType, polygonType := geometryArrowTypes() + values := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + values = append(values, col.expectedWKT(point, x, y)) + } + return values +} + +func geometryArrowTypes() (pointType, lineType, polygonType arrow.DataType) { + pointType = arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType = arrow.ListOf(pointType) + polygonType = arrow.ListOf(lineType) + return pointType, lineType, polygonType +} + +func geometrySelectList() string { + pointType, lineType, polygonType := geometryArrowTypes() + exprs := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + exprs = append(exprs, "ST_AsText("+col.name+")") + } + return strings.Join(exprs, ",\n") +} + +type sqlScanner interface { + Scan(dest ...any) error +} + +func scanGeometryValues(t *testing.T, scanner sqlScanner) []string { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + values := make([]string, len(columns)) + scanArgs := make([]any, 0, len(columns)) + for i := range columns { + scanArgs = append(scanArgs, &values[i]) + } + require.NoError(t, scanner.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) } + return values +} + +func scanNamedGeometryValues(t *testing.T, rows *sql.Rows) (string, []string) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + var name string + values := make([]string, len(columns)) + scanArgs := []any{&name} + for i := range columns { + scanArgs = append(scanArgs, &values[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return name, values } func polygonWKT(x, y string) string { @@ -1484,7 +1724,10 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, geom_geojson geom_hugr_geojson geom_plain_geojson + geom_geojson_struct + geom_geojson_arrow_json geom_wkb + geom_hexwkb geom_line geom_polygon_native geom_multipoint @@ -1520,20 +1763,23 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, assert.Equal(t, expected, got) } -func geometryReadExpected(name string, point [2]float64, x, y float64) map[string]any { +func geometryReadExpected(name string, point xyPoint, x, y float64) map[string]any { return map[string]any{ - "name": name, - "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), - "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), - "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), - "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), - "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), - "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), - "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_struct": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_arrow_json": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_hexwkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), } } @@ -1545,7 +1791,7 @@ func geoJSONGeometry(typ string, coordinates any) map[string]any { } func pointCoordinate(point xyPoint) []any { - return []any{point[0], point[1]} + return []any{point.x, point.y} } func pointCoordinates(points []xyPoint) []any { @@ -1590,48 +1836,105 @@ func compactWKT(s string) string { return s } -func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { +func buildGeometryTypesBatch(t *testing.T, pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { + t.Helper() b := array.NewRecordBuilder(pool, schema) defer b.Release() for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - x := float64(row % 100) - y := float64(row / 1000) - appendGeometryTypesRow(b, fmt.Sprintf("%s-%06d", namePrefix, row), float64(row)*0.5, row%2 == 0, [2]float64{x, y}, x, y) + name, point := geometryBatchRow(namePrefix, row) + appendGeometryTypesRow(t, b, geometryTypesRow{ + name: name, + value: float64(row) * 0.5, + active: row%2 == 0, + point: point, + shapeOrigin: point, + }) } return b.NewRecordBatch() } -func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, active bool, point [2]float64, shapeX, shapeY float64) { - b.Field(0).(*array.StringBuilder).Append(name) - b.Field(1).(*array.Float64Builder).Append(value) - b.Field(2).(*array.BooleanBuilder).Append(active) +func geometryBatchRow(namePrefix string, row int) (string, xyPoint) { + return fmt.Sprintf("%s-%06d", namePrefix, row), xyPoint{ + x: float64(row % 100), + y: float64(row / 1000), + } +} - sb := b.Field(3).(*array.StructBuilder) - sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) +func appendGeometryTypesRow(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append(row.name) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(row.value) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(row.active) + appendGeometryValueFields(t, b, row) +} + +func appendGeometryValueFields(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + x, y := row.shapeOrigin.x, row.shapeOrigin.y + + appendPoint(recordFieldBuilder(t, b, "geom").(*array.StructBuilder), row.point) + recordFieldBuilder(t, b, "geom_wkt").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_hugr_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_plain_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + appendGeoJSONPolygonStruct(t, recordFieldBuilder(t, b, "geom_geojson_struct"), x, y) + recordFieldBuilder(t, b, "geom_geojson_arrow_json").(*array.ExtensionBuilder).StorageBuilder().(*array.StringBuilder).Append(polygonGeoJSON(x, y)) - b.Field(4).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) - b.Field(5).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - b.Field(6).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - b.Field(7).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) - b.Field(8).(*array.BinaryBuilder).Append(wkbPoint) - appendPointList(b.Field(9).(*array.ListBuilder), linePoints(shapeX, shapeY)) - appendPointListList(b.Field(10).(*array.ListBuilder), polygonRings(shapeX, shapeY)) - appendPointList(b.Field(11).(*array.ListBuilder), multiPoints(shapeX, shapeY)) - appendPointListList(b.Field(12).(*array.ListBuilder), multiLines(shapeX, shapeY)) - appendPointListListList(b.Field(13).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) + wkbPoint, err := wkb.Marshal(orb.Point{row.point.x, row.point.y}) + require.NoError(t, err) + recordFieldBuilder(t, b, "geom_wkb").(*array.BinaryBuilder).Append(wkbPoint) + recordFieldBuilder(t, b, "geom_hexwkb").(*array.StringBuilder).Append(strings.ToUpper(hex.EncodeToString(wkbPoint))) + appendPointList(recordFieldBuilder(t, b, "geom_line").(*array.ListBuilder), linePoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_polygon_native").(*array.ListBuilder), polygonRings(x, y)) + appendPointList(recordFieldBuilder(t, b, "geom_multipoint").(*array.ListBuilder), multiPoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_multiline").(*array.ListBuilder), multiLines(x, y)) + appendPointListListList(recordFieldBuilder(t, b, "geom_multipolygon").(*array.ListBuilder), multiPolygons(x, y)) } -type xyPoint [2]float64 +type xyPoint struct { + x float64 + y float64 +} + +const ( + geoArrowPointXField = iota + geoArrowPointYField +) + +const ( + geoJSONGeometryTypeField = iota + geoJSONGeometryCoordinatesField +) func appendPoint(sb *array.StructBuilder, point xyPoint) { sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + sb.FieldBuilder(geoArrowPointXField).(*array.Float64Builder).Append(point.x) + sb.FieldBuilder(geoArrowPointYField).(*array.Float64Builder).Append(point.y) +} + +func appendGeoJSONPolygonStruct(t *testing.T, builder array.Builder, x, y float64) { + t.Helper() + sb, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + + sb.Append(true) + sb.FieldBuilder(geoJSONGeometryTypeField).(*array.StringBuilder).Append("Polygon") + appendGeoJSONPolygonCoordinates(sb.FieldBuilder(geoJSONGeometryCoordinatesField).(*array.ListBuilder), polygonRings(x, y)) +} + +func appendGeoJSONPolygonCoordinates(lb *array.ListBuilder, rings [][]xyPoint) { + lb.Append(true) + ringBuilder := lb.ValueBuilder().(*array.ListBuilder) + for _, ring := range rings { + ringBuilder.Append(true) + pointBuilder := ringBuilder.ValueBuilder().(*array.ListBuilder) + for _, point := range ring { + pointBuilder.Append(true) + pointBuilder.ValueBuilder().(*array.Float64Builder).AppendValues([]float64{point.x, point.y}, nil) + } + } } func appendPointList(lb *array.ListBuilder, points []xyPoint) { @@ -1659,31 +1962,31 @@ func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) { } func linePoints(x, y float64) []xyPoint { - return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y + 1}} + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}} } func polygonRings(x, y float64) [][]xyPoint { return [][]xyPoint{ - {{x, y}, {x, y + 4}, {x + 4, y + 4}, {x + 4, y}, {x, y}}, - {{x + 1, y + 1}, {x + 2, y + 1}, {x + 2, y + 2}, {x + 1, y + 2}, {x + 1, y + 1}}, + {{x: x, y: y}, {x: x, y: y + 4}, {x: x + 4, y: y + 4}, {x: x + 4, y: y}, {x: x, y: y}}, + {{x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}, {x: x + 2, y: y + 2}, {x: x + 1, y: y + 2}, {x: x + 1, y: y + 1}}, } } func multiPoints(x, y float64) []xyPoint { - return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y}} + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y}} } func multiLines(x, y float64) [][]xyPoint { return [][]xyPoint{ - {{x, y}, {x + 1, y + 1}}, - {{x + 2, y + 2}, {x + 3, y + 3}}, + {{x: x, y: y}, {x: x + 1, y: y + 1}}, + {{x: x + 2, y: y + 2}, {x: x + 3, y: y + 3}}, } } func multiPolygons(x, y float64) [][][]xyPoint { return [][][]xyPoint{ polygonRings(x, y), - {{{x + 10, y + 10}, {x + 10, y + 12}, {x + 12, y + 12}, {x + 12, y + 10}, {x + 10, y + 10}}}, + {{{x: x + 10, y: y + 10}, {x: x + 10, y: y + 12}, {x: x + 12, y: y + 12}, {x: x + 12, y: y + 10}, {x: x + 10, y: y + 10}}}, } } @@ -1694,6 +1997,10 @@ func lineWKT(x, y float64) string { coord(x+2), coord(y+1)) } +func pointWKT(point xyPoint) string { + return fmt.Sprintf("POINT(%s %s)", coord(point.x), coord(point.y)) +} + func polygonGeoJSON(x, y float64) string { return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, coord(x), coord(y), @@ -1718,61 +2025,88 @@ func coord(v float64) string { func buildEventsBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string, base time.Time) arrow.RecordBatch { rb := array.NewRecordBuilder(pool, schema) defer rb.Release() - names := rb.Field(0).(*array.StringBuilder) - values := rb.Field(1).(*array.Float64Builder) - active := rb.Field(2).(*array.BooleanBuilder) - payloads := rb.Field(3).(*array.StringBuilder) - ts := rb.Field(4).(*array.TimestampBuilder) + fields := eventsRecordBuildersFor(rb) for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) - values.Append(float64(row) * 0.5) - active.Append(row%2 == 0) + fields.names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) if row%5 == 0 { - payloads.AppendNull() + fields.payloads.AppendNull() } else { - payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) } - ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) } - return rb.NewRecord() + return rb.NewRecordBatch() } -// writeEventsArrowFile writes an Arrow IPC file (stream or file format) at -// path with `numBatches * rowsPerBatch` rows for the events schema. -func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { - t.Helper() - pool := memory.NewGoAllocator() - schema := eventsArrowSchema() +type arrowIPCRecordWriter interface { + Write(arrow.RecordBatch) error + Close() error +} - f, err := os.Create(path) - require.NoError(t, err) - defer f.Close() +func newArrowIPCRecordWriter(t *testing.T, f *os.File, schema *arrow.Schema, format arrowFileFormat) arrowIPCRecordWriter { + t.Helper() - type writer interface { - Write(arrow.RecordBatch) error - Close() error - } - var w writer switch format { case arrowStreamFormat: - w = ipc.NewWriter(f, ipc.WithSchema(schema)) + return ipc.NewWriter(f, ipc.WithSchema(schema)) case arrowFileFmt: - fw, ferr := ipc.NewFileWriter(f, ipc.WithSchema(schema)) - require.NoError(t, ferr) - w = fw + w, err := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, err) + return w default: t.Fatalf("unknown arrow file format: %d", format) + return nil } +} - base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) +func writeArrowIPCFile(t *testing.T, path string, schema *arrow.Schema, format arrowFileFormat, numBatches int, buildBatch func(batchIdx int) arrow.RecordBatch) { + t.Helper() + + f, err := os.Create(path) + require.NoError(t, err) + defer f.Close() + + w := newArrowIPCRecordWriter(t, f, schema, format) for batchIdx := 0; batchIdx < numBatches; batchIdx++ { - rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + rec := buildBatch(batchIdx) require.NoError(t, w.Write(rec)) rec.Release() } require.NoError(t, w.Close()) } +// writeEventsArrowFile writes an Arrow IPC file (stream or file format) at +// path with `numBatches * rowsPerBatch` rows for the events schema. +func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { + t.Helper() + pool := memory.NewGoAllocator() + schema := eventsArrowFileSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + writeArrowIPCFile(t, path, schema, format, numBatches, func(batchIdx int) arrow.RecordBatch { + rb := array.NewRecordBuilder(pool, schema) + defer rb.Release() + fields := eventsRecordBuildersFor(rb) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + name, point := geometryBatchRow(namePrefix, row) + fields.names.Append(name) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) + if row%5 == 0 { + fields.payloads.AppendNull() + } else { + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + appendGeometryValueFields(t, rb, geometryTypesRow{point: point, shapeOrigin: point}) + } + return rb.NewRecordBatch() + }) +} + // Silence "imported and not used" if a refactor leaves a quoted ref around. var _ atomic.Int64 diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql index d5946e6a..31394d6d 100644 --- a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -19,13 +19,17 @@ type events @table(name: "events") { payload_map: JSON payload_scalar: JSON payload_arrow_json: JSON + payload_geo_point: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 4326, type: POINT) geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_hugr_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_plain_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_geojson_struct: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_geojson_arrow_json: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) + geom_hexwkb: Geometry @geometry_info(srid: 4326, type: POINT) geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) geom_multipoint: Geometry @geometry_info(srid: 4326, type: MULTIPOINT) diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go index cc0054d0..7f6f0aa7 100644 --- a/integration-test/ingest-postgres/ingest_postgres_test.go +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -6,6 +6,7 @@ import ( "bytes" "context" "database/sql" + "encoding/hex" "encoding/json" "errors" "fmt" @@ -250,10 +251,10 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) - b.Field(1).(*array.Float64Builder).AppendValues(values, nil) - b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) - pBuilder := b.Field(3).(*array.StringBuilder) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues(names, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues(values, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := recordFieldBuilder(t, b, "payload").(*array.StringBuilder) for _, p := range payload { if p == "" { pBuilder.AppendNull() @@ -261,9 +262,9 @@ func makeEventsRecord(t *testing.T, names []string, values []float64, active []b pBuilder.Append(p) } } - tsBuilder := b.Field(4).(*array.TimestampBuilder) + tsBuilder := recordFieldBuilder(t, b, "created_at").(*array.TimestampBuilder) tsBuilder.AppendValues(created, nil) - return b.NewRecord() + return b.NewRecordBatch() } func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { @@ -282,128 +283,239 @@ func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { }, nil) b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) defer b.Release() - b.Field(0).(*array.StringBuilder).Append("malformed-json") - b.Field(1).(*array.Float64Builder).Append(1) - b.Field(2).(*array.BooleanBuilder).Append(true) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("malformed-json") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + payloadBuilder := recordFieldBuilder(t, b, payloadName) if binary { - b.Field(3).(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + payloadBuilder.(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) } else { - b.Field(3).(*array.StringBuilder).Append(`{"unterminated":`) + payloadBuilder.(*array.StringBuilder).Append(`{"unterminated":`) } - return b.NewRecord() + return b.NewRecordBatch() } -var jsonPhysicalTypeColumns = []string{ - "payload", - "payload_large_string", - "payload_string_view", - "payload_binary", - "payload_large_binary", - "payload_binary_view", - "payload_struct", - "payload_list", - "payload_large_list", - "payload_fixed_size_list", - "payload_list_view", - "payload_large_list_view", - "payload_map", - "payload_scalar", - "payload_arrow_json", +func recordFieldBuilder(t *testing.T, b *array.RecordBuilder, name string) array.Builder { + t.Helper() + indices := b.Schema().FieldIndices(name) + require.Len(t, indices, 1, "arrow field %q must exist exactly once", name) + return b.Field(indices[0]) } -func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { +func mustRecordFieldBuilder(b *array.RecordBuilder, name string) array.Builder { + indices := b.Schema().FieldIndices(name) + if len(indices) != 1 { + panic(fmt.Sprintf("arrow field %q must exist exactly once", name)) + } + return b.Field(indices[0]) +} + +type eventsRecordBuilders struct { + names *array.StringBuilder + values *array.Float64Builder + active *array.BooleanBuilder + payloads *array.StringBuilder + createdAt *array.TimestampBuilder +} + +func eventsRecordBuildersFor(b *array.RecordBuilder) eventsRecordBuilders { + return eventsRecordBuilders{ + names: mustRecordFieldBuilder(b, "name").(*array.StringBuilder), + values: mustRecordFieldBuilder(b, "value").(*array.Float64Builder), + active: mustRecordFieldBuilder(b, "is_active").(*array.BooleanBuilder), + payloads: mustRecordFieldBuilder(b, "payload").(*array.StringBuilder), + createdAt: mustRecordFieldBuilder(b, "created_at").(*array.TimestampBuilder), + } +} + +type jsonPhysicalTypeSpec struct { + name string + dataType arrow.DataType + arrowExtension string + expected any + appendValue func(*testing.T, array.Builder) +} + +const ( + jsonStructKindField = iota + jsonStructCountField +) + +func jsonPhysicalTypeSpecs(t *testing.T) []jsonPhysicalTypeSpec { t.Helper() - pool := memory.NewGoAllocator() structType := arrow.StructOf( arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, ) + geoPointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) require.NoError(t, err) - schema := arrow.NewSchema([]arrow.Field{ + + return []jsonPhysicalTypeSpec{ + {name: "payload", dataType: arrow.BinaryTypes.String, expected: map[string]any{"kind": "string"}, appendValue: appendJSONText(`{"kind":"string"}`)}, + {name: "payload_large_string", dataType: arrow.BinaryTypes.LargeString, expected: map[string]any{"kind": "large_string"}, appendValue: appendJSONText(`{"kind":"large_string"}`)}, + {name: "payload_string_view", dataType: arrow.BinaryTypes.StringView, expected: map[string]any{"kind": "string_view"}, appendValue: appendJSONText(`{"kind":"string_view"}`)}, + {name: "payload_binary", dataType: arrow.BinaryTypes.Binary, expected: map[string]any{"kind": "binary"}, appendValue: appendJSONText(`{"kind":"binary"}`)}, + {name: "payload_large_binary", dataType: arrow.BinaryTypes.LargeBinary, expected: map[string]any{"kind": "large_binary"}, appendValue: appendJSONText(`{"kind":"large_binary"}`)}, + {name: "payload_binary_view", dataType: arrow.BinaryTypes.BinaryView, expected: map[string]any{"kind": "binary_view"}, appendValue: appendJSONText(`{"kind":"binary_view"}`)}, + {name: "payload_struct", dataType: structType, expected: map[string]any{"kind": "struct", "count": float64(14)}, appendValue: appendJSONStruct("struct", 14)}, + {name: "payload_list", dataType: arrow.ListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(1), float64(2)}, appendValue: appendInt64JSONList(1, 2)}, + {name: "payload_large_list", dataType: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(3), float64(4)}, appendValue: appendInt64JSONList(3, 4)}, + {name: "payload_fixed_size_list", dataType: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), expected: []any{float64(5), float64(6)}, appendValue: appendInt64JSONList(5, 6)}, + {name: "payload_list_view", dataType: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(7), float64(8)}, appendValue: appendInt64JSONList(7, 8)}, + {name: "payload_large_list_view", dataType: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(9), float64(10)}, appendValue: appendInt64JSONList(9, 10)}, + {name: "payload_map", dataType: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), expected: map[string]any{"a": float64(11), "b": float64(12)}, appendValue: appendInt64JSONMap([]string{"a", "b"}, []int64{11, 12})}, + {name: "payload_scalar", dataType: arrow.PrimitiveTypes.Int64, expected: "13", appendValue: appendInt64JSONScalar(13)}, + {name: "payload_arrow_json", dataType: arrowJSONType, expected: map[string]any{"kind": "arrow_json"}, appendValue: appendArrowJSONText(`{"kind":"arrow_json"}`)}, + {name: "payload_geo_point", dataType: geoPointType, arrowExtension: "geoarrow.point", expected: geoJSONGeometry("Point", pointCoordinate(xyPoint{x: 30.5, y: 50.25})), appendValue: appendGeoArrowJSONPoint(xyPoint{x: 30.5, y: 50.25})}, + } +} + +func jsonPhysicalTypeColumns(t *testing.T) []string { + t.Helper() + specs := jsonPhysicalTypeSpecs(t) + columns := make([]string, 0, len(specs)) + for _, spec := range specs { + columns = append(columns, spec.name) + } + return columns +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + specs := jsonPhysicalTypeSpecs(t) + fields := []arrow.Field{ {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, - {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: false}, - {Name: "payload_large_string", Type: arrow.BinaryTypes.LargeString, Nullable: false}, - {Name: "payload_string_view", Type: arrow.BinaryTypes.StringView, Nullable: false}, - {Name: "payload_binary", Type: arrow.BinaryTypes.Binary, Nullable: false}, - {Name: "payload_large_binary", Type: arrow.BinaryTypes.LargeBinary, Nullable: false}, - {Name: "payload_binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: false}, - {Name: "payload_struct", Type: structType, Nullable: false}, - {Name: "payload_list", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_large_list", Type: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_fixed_size_list", Type: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_list_view", Type: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_large_list_view", Type: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_map", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), Nullable: false}, - {Name: "payload_scalar", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, - {Name: "payload_arrow_json", Type: arrowJSONType, Nullable: false}, - }, nil) + } + for _, spec := range specs { + field := arrow.Field{Name: spec.name, Type: spec.dataType, Nullable: false} + if spec.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": spec.arrowExtension}) + } + fields = append(fields, field) + } + schema := arrow.NewSchema(fields, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).Append("json-physical-types") - b.Field(1).(*array.Float64Builder).Append(1) - b.Field(2).(*array.BooleanBuilder).Append(true) - b.Field(3).(*array.StringBuilder).Append(`{"kind":"string"}`) - b.Field(4).(*array.LargeStringBuilder).Append(`{"kind":"large_string"}`) - b.Field(5).(*array.StringViewBuilder).Append(`{"kind":"string_view"}`) - b.Field(6).(*array.BinaryBuilder).Append([]byte(`{"kind":"binary"}`)) - b.Field(7).(*array.BinaryBuilder).Append([]byte(`{"kind":"large_binary"}`)) - b.Field(8).(*array.BinaryViewBuilder).Append([]byte(`{"kind":"binary_view"}`)) - - structBuilder := b.Field(9).(*array.StructBuilder) - structBuilder.Append(true) - structBuilder.FieldBuilder(0).(*array.StringBuilder).Append("struct") - structBuilder.FieldBuilder(1).(*array.Int64Builder).Append(14) - - listBuilder := b.Field(10).(*array.ListBuilder) - listBuilder.Append(true) - listBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{1, 2}, nil) - largeListBuilder := b.Field(11).(*array.LargeListBuilder) - largeListBuilder.Append(true) - largeListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{3, 4}, nil) - fixedListBuilder := b.Field(12).(*array.FixedSizeListBuilder) - fixedListBuilder.Append(true) - fixedListBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{5, 6}, nil) - listViewBuilder := b.Field(13).(*array.ListViewBuilder) - listViewBuilder.AppendWithSize(true, 2) - listViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{7, 8}, nil) - largeListViewBuilder := b.Field(14).(*array.LargeListViewBuilder) - largeListViewBuilder.AppendWithSize(true, 2) - largeListViewBuilder.ValueBuilder().(*array.Int64Builder).AppendValues([]int64{9, 10}, nil) - mapBuilder := b.Field(15).(*array.MapBuilder) - mapBuilder.Append(true) - mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues([]string{"a", "b"}, nil) - mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues([]int64{11, 12}, nil) - b.Field(16).(*array.Int64Builder).Append(13) - arrowJSONBuilder := b.Field(17).(*array.ExtensionBuilder) - arrowJSONBuilder.StorageBuilder().(*array.StringBuilder).Append(`{"kind":"arrow_json"}`) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("json-physical-types") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + for _, spec := range specs { + spec.appendValue(t, recordFieldBuilder(t, b, spec.name)) + } return b.NewRecordBatch() } -func jsonPhysicalTypesExpected() map[string]any { - return map[string]any{ - "name": "json-physical-types", - "payload": map[string]any{"kind": "string"}, - "payload_large_string": map[string]any{"kind": "large_string"}, - "payload_string_view": map[string]any{"kind": "string_view"}, - "payload_binary": map[string]any{"kind": "binary"}, - "payload_large_binary": map[string]any{"kind": "large_binary"}, - "payload_binary_view": map[string]any{"kind": "binary_view"}, - "payload_struct": map[string]any{"kind": "struct", "count": float64(14)}, - "payload_list": []any{float64(1), float64(2)}, - "payload_large_list": []any{float64(3), float64(4)}, - "payload_fixed_size_list": []any{float64(5), float64(6)}, - "payload_list_view": []any{float64(7), float64(8)}, - "payload_large_list_view": []any{float64(9), float64(10)}, - "payload_map": map[string]any{"a": float64(11), "b": float64(12)}, - "payload_scalar": "13", - "payload_arrow_json": map[string]any{"kind": "arrow_json"}, +func appendJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.StringBuilder: + b.Append(value) + case *array.LargeStringBuilder: + b.Append(value) + case *array.StringViewBuilder: + b.Append(value) + case *array.BinaryBuilder: + b.Append([]byte(value)) + case *array.BinaryViewBuilder: + b.Append([]byte(value)) + default: + require.Failf(t, "unsupported JSON text builder", "got %T", builder) + } + } +} + +func appendJSONStruct(kind string, count int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + structBuilder.Append(true) + structBuilder.FieldBuilder(jsonStructKindField).(*array.StringBuilder).Append(kind) + structBuilder.FieldBuilder(jsonStructCountField).(*array.Int64Builder).Append(count) } } +func appendInt64JSONList(values ...int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.ListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.FixedSizeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.ListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + default: + require.Failf(t, "unsupported JSON list builder", "got %T", builder) + } + } +} + +func appendInt64JSONMap(keys []string, values []int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + mapBuilder, ok := builder.(*array.MapBuilder) + require.Truef(t, ok, "got %T, want *array.MapBuilder", builder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues(keys, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues(values, nil) + } +} + +func appendInt64JSONScalar(value int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + intBuilder, ok := builder.(*array.Int64Builder) + require.Truef(t, ok, "got %T, want *array.Int64Builder", builder) + intBuilder.Append(value) + } +} + +func appendArrowJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + extensionBuilder, ok := builder.(*array.ExtensionBuilder) + require.Truef(t, ok, "got %T, want *array.ExtensionBuilder", builder) + extensionBuilder.StorageBuilder().(*array.StringBuilder).Append(value) + } +} + +func appendGeoArrowJSONPoint(point xyPoint) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + appendPoint(structBuilder, point) + } +} + +func jsonPhysicalTypesExpected(t *testing.T) map[string]any { + t.Helper() + expected := map[string]any{"name": "json-physical-types"} + for _, spec := range jsonPhysicalTypeSpecs(t) { + expected[spec.name] = spec.expected + } + return expected +} + func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { t.Helper() query := fmt.Sprintf(`{ @@ -413,7 +525,7 @@ func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, %s } } - }`, dsName, strings.Join(jsonPhysicalTypeColumns, "\n")) + }`, dsName, strings.Join(jsonPhysicalTypeColumns(t), "\n")) res, err := service.Query(context.Background(), query, nil) require.NoError(t, err) defer res.Close() @@ -427,7 +539,7 @@ func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, root := data[dsName].(map[string]any) rows := root["events"].([]any) require.Len(t, rows, 1, "response: %s", string(body)) - assert.Equal(t, jsonPhysicalTypesExpected(), rows[0]) + assert.Equal(t, jsonPhysicalTypesExpected(t), rows[0]) } // --- Tests ---------------------------------------------------------------- @@ -491,7 +603,7 @@ func TestIngest_Postgres_JSONPhysicalTypes(t *testing.T) { res, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) require.NoError(t, err) assert.Equal(t, int64(1), res.Inserted) - expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns...) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns(t)...) assert.ElementsMatch(t, expectedColumns, res.Columns) assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) } @@ -532,10 +644,10 @@ func TestIngest_Postgres_UsesBinaryCopyWithoutTextOnlyTypes(t *testing.T) { }, }, nil) b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) - b.Field(0).(*array.StringBuilder).Append("binary-copy") - b.Field(1).(*array.Float64Builder).Append(42) - b.Field(2).(*array.StringBuilder).Append("POINT (7.25 8.5)") - rec := b.NewRecord() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("binary-copy") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(42) + recordFieldBuilder(t, b, "geom").(*array.StringBuilder).Append("POINT (7.25 8.5)") + rec := b.NewRecordBatch() b.Release() defer rec.Release() @@ -585,8 +697,8 @@ func TestIngest_Postgres_GeometryEdgeCases(t *testing.T) { }, nil) b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) - names := b.Field(0).(*array.StringBuilder) - geoms := b.Field(1).(*array.StringBuilder) + names := recordFieldBuilder(t, b, "name").(*array.StringBuilder) + geoms := recordFieldBuilder(t, b, "geom").(*array.StringBuilder) names.Append("a_null") geoms.AppendNull() @@ -597,7 +709,7 @@ func TestIngest_Postgres_GeometryEdgeCases(t *testing.T) { names.Append("d_geomcollection") geoms.Append("GEOMETRYCOLLECTION(POINT(1 2),LINESTRING(0 0,1 1))") - rec := b.NewRecord() + rec := b.NewRecordBatch() b.Release() defer rec.Release() @@ -773,24 +885,25 @@ func TestIngest_Postgres_MultipleBatches(t *testing.T) { mk := func(names []string) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues(names, nil) + fields := eventsRecordBuildersFor(b) + fields.names.AppendValues(names, nil) vals := make([]float64, len(names)) for i := range vals { vals[i] = float64(i) } - b.Field(1).(*array.Float64Builder).AppendValues(vals, nil) + fields.values.AppendValues(vals, nil) active := make([]bool, len(names)) for i := range active { active[i] = true } - b.Field(2).(*array.BooleanBuilder).AppendValues(active, nil) - b.Field(3).(*array.StringBuilder).AppendNulls(len(names)) + fields.active.AppendValues(active, nil) + fields.payloads.AppendNulls(len(names)) ts := make([]arrow.Timestamp, len(names)) for i := range ts { ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) } - b.Field(4).(*array.TimestampBuilder).AppendValues(ts, nil) - return b.NewRecord() + fields.createdAt.AppendValues(ts, nil) + return b.NewRecordBatch() } rec1 := mk([]string{"a", "b"}) defer rec1.Release() @@ -897,10 +1010,10 @@ func TestIngest_Postgres_Stream(t *testing.T) { {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, }, nil) b := array.NewRecordBuilder(pool, schema) - b.Field(0).(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) - b.Field(1).(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) - b.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecordBatch() b.Release() defer rec.Release() @@ -951,80 +1064,207 @@ func eventsArrowSchema() *arrow.Schema { }, nil) } -func makeGeometryTypesRecord(t *testing.T, names []string, points [][2]float64) (arrow.RecordBatch, *arrow.Schema) { +func eventsArrowFileSchema() *arrow.Schema { + fields := append([]arrow.Field{}, eventsArrowSchema().Fields()...) + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +type geometryTypesRow struct { + name string + value float64 + active bool + point xyPoint + shapeOrigin xyPoint +} + +func makeGeometryTypesRecord(t *testing.T, rows []geometryTypesRow) (arrow.RecordBatch, *arrow.Schema) { t.Helper() - require.Len(t, points, len(names)) schema := geometryTypesSchema() pool := memory.NewGoAllocator() b := array.NewRecordBuilder(pool, schema) defer b.Release() - for i, name := range names { - appendGeometryTypesRow(b, name, float64(i+1), true, points[i], float64(i), float64(i)) + for _, row := range rows { + appendGeometryTypesRow(t, b, row) } return b.NewRecordBatch(), schema } func geometryTypesSchema() *arrow.Schema { + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +func geometryArrowFields() []arrow.Field { pointType := arrow.StructOf( arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, ) lineType := arrow.ListOf(pointType) polygonType := arrow.ListOf(lineType) - return arrow.NewSchema([]arrow.Field{ - {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, - {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, - {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, - {Name: "geom", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, - {Name: "geom_4326", Type: pointType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.point"})}, - {Name: "geom_wkt", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, - {Name: "geom_wkt_4326", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"})}, - {Name: "geom_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.geojson"})}, - {Name: "geom_hugr_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.geojson"})}, - {Name: "geom_plain_geojson", Type: arrow.BinaryTypes.String, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geojson"})}, - {Name: "geom_wkb", Type: arrow.BinaryTypes.Binary, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkb"})}, - {Name: "geom_line", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.linestring"})}, - {Name: "geom_polygon_native", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.polygon"})}, - {Name: "geom_multipoint", Type: lineType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipoint"})}, - {Name: "geom_multiline", Type: polygonType, Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multilinestring"})}, - {Name: "geom_multipolygon", Type: arrow.ListOf(polygonType), Nullable: false, Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.multipolygon"})}, - }, nil) + fields := make([]arrow.Field, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + field := arrow.Field{ + Name: col.name, + Type: col.arrowType, + Nullable: false, + } + if col.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": col.arrowExtension}) + } + fields = append(fields, field) + } + return fields +} + +type geometryValueColumn struct { + name string + arrowType arrow.DataType + arrowExtension string + expectedWKT func(point, x, y string) string + expectedSRID int +} + +func geometryValueColumns(pointType, lineType, polygonType arrow.DataType) []geometryValueColumn { + geoJSONStructType := arrow.StructOf( + arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "coordinates", Type: arrow.ListOf(arrow.ListOf(arrow.ListOf(arrow.PrimitiveTypes.Float64))), Nullable: false}, + ) + line := func(_ string, x string, y string) string { + return fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) + } + polygon := func(_ string, x string, y string) string { return polygonWKT(x, y) } + point := func(point string, _ string, _ string) string { return point } + multiPoint := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y) + } + multiLine := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)) + } + multiPolygon := func(_ string, x string, y string) string { return multiPolygonWKT(x, y) } + + return []geometryValueColumn{ + {name: "geom", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point}, + {name: "geom_4326", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point, expectedSRID: 4326}, + {name: "geom_wkt", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line}, + {name: "geom_wkt_4326", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line, expectedSRID: 4326}, + {name: "geom_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.geojson", expectedWKT: polygon}, + {name: "geom_hugr_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.geojson", expectedWKT: polygon}, + {name: "geom_plain_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geojson", expectedWKT: polygon}, + {name: "geom_geojson_struct", arrowType: geoJSONStructType, expectedWKT: polygon}, + {name: "geom_geojson_arrow_json", arrowType: mustArrowJSONType(), arrowExtension: "arrow.json", expectedWKT: polygon}, + {name: "geom_wkb", arrowType: arrow.BinaryTypes.Binary, arrowExtension: "geoarrow.wkb", expectedWKT: point}, + {name: "geom_hexwkb", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.hexwkb", expectedWKT: point}, + {name: "geom_line", arrowType: lineType, arrowExtension: "geoarrow.linestring", expectedWKT: line}, + {name: "geom_polygon_native", arrowType: polygonType, arrowExtension: "geoarrow.polygon", expectedWKT: polygon}, + {name: "geom_multipoint", arrowType: lineType, arrowExtension: "geoarrow.multipoint", expectedWKT: multiPoint}, + {name: "geom_multiline", arrowType: polygonType, arrowExtension: "geoarrow.multilinestring", expectedWKT: multiLine}, + {name: "geom_multipolygon", arrowType: arrow.ListOf(polygonType), arrowExtension: "geoarrow.multipolygon", expectedWKT: multiPolygon}, + } +} + +func mustArrowJSONType() arrow.DataType { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + panic(err) + } + return typ } func geometryTypesColumns() []string { - return []string{ - "name", "value", "is_active", - "geom", "geom_4326", "geom_wkt", "geom_wkt_4326", "geom_geojson", - "geom_hugr_geojson", "geom_plain_geojson", "geom_wkb", - "geom_line", "geom_polygon_native", "geom_multipoint", - "geom_multiline", "geom_multipolygon", + pointType, lineType, polygonType := geometryArrowTypes() + columns := []string{"name", "value", "is_active"} + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + columns = append(columns, col.name) } + return columns } func geometryExpected(point, x, y string) []string { - line := fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) - return []string{ - point, - point, - line, - line, - polygonWKT(x, y), - polygonWKT(x, y), - polygonWKT(x, y), - point, - line, - polygonWKT(x, y), - fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y), - fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)), - multiPolygonWKT(x, y), + pointType, lineType, polygonType := geometryArrowTypes() + values := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + values = append(values, col.expectedWKT(point, x, y)) } + return values } func geometrySRIDExpected() []int { - return []int{0, 4326, 0, 4326, 0, 0, 0, 0, 0, 0, 0, 0, 0} + pointType, lineType, polygonType := geometryArrowTypes() + srids := make([]int, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + srids = append(srids, col.expectedSRID) + } + return srids +} + +func geometryArrowTypes() (pointType, lineType, polygonType arrow.DataType) { + pointType = arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType = arrow.ListOf(pointType) + polygonType = arrow.ListOf(lineType) + return pointType, lineType, polygonType +} + +func geometrySelectList(withSRID bool) string { + pointType, lineType, polygonType := geometryArrowTypes() + exprs := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))*2) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + exprs = append(exprs, "ST_AsText("+col.name+")") + if withSRID { + exprs = append(exprs, "ST_SRID("+col.name+")") + } + } + return strings.Join(exprs, ",\n") +} + +type sqlScanner interface { + Scan(dest ...any) error +} + +func scanGeometryValuesWithSRID(t *testing.T, scanner sqlScanner) ([]string, []int) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + values := make([]string, len(columns)) + srids := make([]int, len(columns)) + scanArgs := make([]any, 0, len(columns)*2) + for i := range columns { + scanArgs = append(scanArgs, &values[i], &srids[i]) + } + require.NoError(t, scanner.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return values, srids +} + +func scanNamedGeometryValuesWithSRID(t *testing.T, rows *sql.Rows) (string, []string, []int) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + var name string + values := make([]string, len(columns)) + srids := make([]int, len(columns)) + scanArgs := []any{&name} + for i := range columns { + scanArgs = append(scanArgs, &values[i], &srids[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return name, values, srids } func polygonWKT(x, y string) string { @@ -1076,7 +1316,10 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, geom_geojson geom_hugr_geojson geom_plain_geojson + geom_geojson_struct + geom_geojson_arrow_json geom_wkb + geom_hexwkb geom_line geom_polygon_native geom_multipoint @@ -1112,22 +1355,25 @@ func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, assert.Equal(t, expected, got) } -func geometryReadExpected(name string, point [2]float64, x, y float64) map[string]any { +func geometryReadExpected(name string, point xyPoint, x, y float64) map[string]any { return map[string]any{ - "name": name, - "geom": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), - "geom_4326": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), - "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), - "geom_wkt_4326": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), - "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_wkb": geoJSONGeometry("Point", pointCoordinate(xyPoint{point[0], point[1]})), - "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), - "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), - "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), - "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), - "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_4326": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_wkt_4326": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_struct": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_arrow_json": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_hexwkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), } } @@ -1139,7 +1385,7 @@ func geoJSONGeometry(typ string, coordinates any) map[string]any { } func pointCoordinate(point xyPoint) []any { - return []any{point[0], point[1]} + return []any{point.x, point.y} } func pointCoordinates(points []xyPoint) []any { @@ -1184,54 +1430,107 @@ func compactWKT(s string) string { return s } -func buildGeometryTypesBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { +func buildGeometryTypesBatch(t *testing.T, pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { + t.Helper() b := array.NewRecordBuilder(pool, schema) defer b.Release() for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - x := float64(row % 100) - y := float64(row / 1000) - appendGeometryTypesRow(b, fmt.Sprintf("%s-%06d", namePrefix, row), float64(row)*0.5, row%2 == 0, [2]float64{x, y}, x, y) + name, point := geometryBatchRow(namePrefix, row) + appendGeometryTypesRow(t, b, geometryTypesRow{ + name: name, + value: float64(row) * 0.5, + active: row%2 == 0, + point: point, + shapeOrigin: point, + }) } return b.NewRecordBatch() } -func appendGeometryTypesRow(b *array.RecordBuilder, name string, value float64, active bool, point [2]float64, shapeX, shapeY float64) { - b.Field(0).(*array.StringBuilder).Append(name) - b.Field(1).(*array.Float64Builder).Append(value) - b.Field(2).(*array.BooleanBuilder).Append(active) +func geometryBatchRow(namePrefix string, row int) (string, xyPoint) { + return fmt.Sprintf("%s-%06d", namePrefix, row), xyPoint{ + x: float64(row % 100), + y: float64(row / 1000), + } +} - sb := b.Field(3).(*array.StructBuilder) - sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) +func appendGeometryTypesRow(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append(row.name) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(row.value) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(row.active) + appendGeometryValueFields(t, b, row) +} - sb = b.Field(4).(*array.StructBuilder) - sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) +func appendGeometryValueFields(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + x, y := row.shapeOrigin.x, row.shapeOrigin.y + + appendPoint(recordFieldBuilder(t, b, "geom").(*array.StructBuilder), row.point) + appendPoint(recordFieldBuilder(t, b, "geom_4326").(*array.StructBuilder), row.point) + recordFieldBuilder(t, b, "geom_wkt").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_wkt_4326").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_hugr_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_plain_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + appendGeoJSONPolygonStruct(t, recordFieldBuilder(t, b, "geom_geojson_struct"), x, y) + recordFieldBuilder(t, b, "geom_geojson_arrow_json").(*array.ExtensionBuilder).StorageBuilder().(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + + wkbPoint, err := wkb.Marshal(orb.Point{row.point.x, row.point.y}) + require.NoError(t, err) + recordFieldBuilder(t, b, "geom_wkb").(*array.BinaryBuilder).Append(wkbPoint) + recordFieldBuilder(t, b, "geom_hexwkb").(*array.StringBuilder).Append(strings.ToUpper(hex.EncodeToString(wkbPoint))) + appendPointList(recordFieldBuilder(t, b, "geom_line").(*array.ListBuilder), linePoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_polygon_native").(*array.ListBuilder), polygonRings(x, y)) + appendPointList(recordFieldBuilder(t, b, "geom_multipoint").(*array.ListBuilder), multiPoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_multiline").(*array.ListBuilder), multiLines(x, y)) + appendPointListListList(recordFieldBuilder(t, b, "geom_multipolygon").(*array.ListBuilder), multiPolygons(x, y)) +} - b.Field(5).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) - b.Field(6).(*array.StringBuilder).Append(lineWKT(shapeX, shapeY)) - b.Field(7).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - b.Field(8).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - b.Field(9).(*array.StringBuilder).Append(polygonGeoJSON(shapeX, shapeY)) - wkbPoint, _ := wkb.Marshal(orb.Point{point[0], point[1]}) - b.Field(10).(*array.BinaryBuilder).Append(wkbPoint) - appendPointList(b.Field(11).(*array.ListBuilder), linePoints(shapeX, shapeY)) - appendPointListList(b.Field(12).(*array.ListBuilder), polygonRings(shapeX, shapeY)) - appendPointList(b.Field(13).(*array.ListBuilder), multiPoints(shapeX, shapeY)) - appendPointListList(b.Field(14).(*array.ListBuilder), multiLines(shapeX, shapeY)) - appendPointListListList(b.Field(15).(*array.ListBuilder), multiPolygons(shapeX, shapeY)) +type xyPoint struct { + x float64 + y float64 } -type xyPoint [2]float64 +const ( + geoArrowPointXField = iota + geoArrowPointYField +) + +const ( + geoJSONGeometryTypeField = iota + geoJSONGeometryCoordinatesField +) func appendPoint(sb *array.StructBuilder, point xyPoint) { sb.Append(true) - sb.FieldBuilder(0).(*array.Float64Builder).Append(point[0]) - sb.FieldBuilder(1).(*array.Float64Builder).Append(point[1]) + sb.FieldBuilder(geoArrowPointXField).(*array.Float64Builder).Append(point.x) + sb.FieldBuilder(geoArrowPointYField).(*array.Float64Builder).Append(point.y) +} + +func appendGeoJSONPolygonStruct(t *testing.T, builder array.Builder, x, y float64) { + t.Helper() + sb, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + + sb.Append(true) + sb.FieldBuilder(geoJSONGeometryTypeField).(*array.StringBuilder).Append("Polygon") + appendGeoJSONPolygonCoordinates(sb.FieldBuilder(geoJSONGeometryCoordinatesField).(*array.ListBuilder), polygonRings(x, y)) +} + +func appendGeoJSONPolygonCoordinates(lb *array.ListBuilder, rings [][]xyPoint) { + lb.Append(true) + ringBuilder := lb.ValueBuilder().(*array.ListBuilder) + for _, ring := range rings { + ringBuilder.Append(true) + pointBuilder := ringBuilder.ValueBuilder().(*array.ListBuilder) + for _, point := range ring { + pointBuilder.Append(true) + pointBuilder.ValueBuilder().(*array.Float64Builder).AppendValues([]float64{point.x, point.y}, nil) + } + } } func appendPointList(lb *array.ListBuilder, points []xyPoint) { @@ -1259,31 +1558,31 @@ func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) { } func linePoints(x, y float64) []xyPoint { - return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y + 1}} + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}} } func polygonRings(x, y float64) [][]xyPoint { return [][]xyPoint{ - {{x, y}, {x, y + 4}, {x + 4, y + 4}, {x + 4, y}, {x, y}}, - {{x + 1, y + 1}, {x + 2, y + 1}, {x + 2, y + 2}, {x + 1, y + 2}, {x + 1, y + 1}}, + {{x: x, y: y}, {x: x, y: y + 4}, {x: x + 4, y: y + 4}, {x: x + 4, y: y}, {x: x, y: y}}, + {{x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}, {x: x + 2, y: y + 2}, {x: x + 1, y: y + 2}, {x: x + 1, y: y + 1}}, } } func multiPoints(x, y float64) []xyPoint { - return []xyPoint{{x, y}, {x + 1, y + 1}, {x + 2, y}} + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y}} } func multiLines(x, y float64) [][]xyPoint { return [][]xyPoint{ - {{x, y}, {x + 1, y + 1}}, - {{x + 2, y + 2}, {x + 3, y + 3}}, + {{x: x, y: y}, {x: x + 1, y: y + 1}}, + {{x: x + 2, y: y + 2}, {x: x + 3, y: y + 3}}, } } func multiPolygons(x, y float64) [][][]xyPoint { return [][][]xyPoint{ polygonRings(x, y), - {{{x + 10, y + 10}, {x + 10, y + 12}, {x + 12, y + 12}, {x + 12, y + 10}, {x + 10, y + 10}}}, + {{{x: x + 10, y: y + 10}, {x: x + 10, y: y + 12}, {x: x + 12, y: y + 12}, {x: x + 12, y: y + 10}, {x: x + 10, y: y + 10}}}, } } @@ -1294,6 +1593,10 @@ func lineWKT(x, y float64) string { coord(x+2), coord(y+1)) } +func pointWKT(point xyPoint) string { + return fmt.Sprintf("POINT(%s %s)", coord(point.x), coord(point.y)) +} + func polygonGeoJSON(x, y float64) string { return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, coord(x), coord(y), @@ -1312,61 +1615,74 @@ func coord(v float64) string { return strconv.FormatFloat(v, 'f', -1, 64) } -// writeEventsArrowFile produces an Arrow IPC file at path in the given -// format with numBatches × rowsPerBatch synthetic events rows. namePrefix is -// embedded in the `name` column so different tests can write to the same -// table without colliding on uniqueness assertions. -func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { - t.Helper() - pool := memory.NewGoAllocator() - schema := eventsArrowSchema() +type arrowIPCRecordWriter interface { + Write(arrow.RecordBatch) error + Close() error +} - f, err := os.Create(path) - require.NoError(t, err) - defer f.Close() +func newArrowIPCRecordWriter(t *testing.T, f *os.File, schema *arrow.Schema, format arrowFileFormat) arrowIPCRecordWriter { + t.Helper() - type writer interface { - Write(arrow.RecordBatch) error - Close() error - } - var w writer switch format { case arrowStreamFormat: - w = ipc.NewWriter(f, ipc.WithSchema(schema)) + return ipc.NewWriter(f, ipc.WithSchema(schema)) case arrowFileFmt: - fw, ferr := ipc.NewFileWriter(f, ipc.WithSchema(schema)) - require.NoError(t, ferr) - w = fw + w, err := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, err) + return w default: t.Fatalf("unknown arrow file format: %d", format) + return nil } +} - base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) +func writeArrowIPCFile(t *testing.T, path string, schema *arrow.Schema, format arrowFileFormat, numBatches int, buildBatch func(batchIdx int) arrow.RecordBatch) { + t.Helper() + + f, err := os.Create(path) + require.NoError(t, err) + defer f.Close() + + w := newArrowIPCRecordWriter(t, f, schema, format) for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildBatch(batchIdx) + require.NoError(t, w.Write(rec)) + rec.Release() + } + require.NoError(t, w.Close()) +} + +// writeEventsArrowFile produces an Arrow IPC file at path in the given +// format with numBatches × rowsPerBatch synthetic events rows. namePrefix is +// embedded in the `name` column so different tests can write to the same +// table without colliding on uniqueness assertions. +func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { + t.Helper() + pool := memory.NewGoAllocator() + schema := eventsArrowFileSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + writeArrowIPCFile(t, path, schema, format, numBatches, func(batchIdx int) arrow.RecordBatch { rb := array.NewRecordBuilder(pool, schema) - names := rb.Field(0).(*array.StringBuilder) - values := rb.Field(1).(*array.Float64Builder) - active := rb.Field(2).(*array.BooleanBuilder) - payloads := rb.Field(3).(*array.StringBuilder) - ts := rb.Field(4).(*array.TimestampBuilder) + fields := eventsRecordBuildersFor(rb) for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) - values.Append(float64(row) * 0.5) - active.Append(row%2 == 0) + name, point := geometryBatchRow(namePrefix, row) + fields.names.Append(name) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) if row%5 == 0 { - payloads.AppendNull() + fields.payloads.AppendNull() } else { - payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) } - ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + appendGeometryValueFields(t, rb, geometryTypesRow{point: point, shapeOrigin: point}) } - rec := rb.NewRecord() + rec := rb.NewRecordBatch() rb.Release() - require.NoError(t, w.Write(rec)) - rec.Release() - } - require.NoError(t, w.Close()) + return rec + }) } // TestIngest_Postgres_ArrowIPCFile_StreamFormat builds a 50×1000-row Arrow @@ -1437,6 +1753,7 @@ func TestIngest_Postgres_ArrowIPCFile_StreamFormat(t *testing.T) { var activeCount int require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) assert.Equal(t, totalRows/2, activeCount) + assertArrowIPCFileGeometry(t, env, namePrefix, totalRows) t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)", totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) @@ -1508,11 +1825,27 @@ func TestIngest_Postgres_ArrowIPCFile_FileFormat(t *testing.T) { var activeCount int require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) assert.Equal(t, totalRows/2, activeCount) + assertArrowIPCFileGeometry(t, env, namePrefix, totalRows) t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)", totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) } +func assertArrowIPCFileGeometry(t *testing.T, env *ingestEnv, namePrefix string, totalRows int) { + t.Helper() + lastName, lastPoint := geometryBatchRow(namePrefix, totalRows-1) + values, srids := scanGeometryValuesWithSRID(t, env.pgConn.QueryRow(fmt.Sprintf(` + SELECT %s + FROM events + WHERE name = $1 + `, geometrySelectList(true)), lastName)) + assert.Equal(t, geometryExpected(pointWKT(lastPoint), coord(lastPoint.x), coord(lastPoint.y)), values) + assert.Equal(t, geometrySRIDExpected(), srids) + assertGeometryReadThroughHugr(t, env.service, env.dsName, fmt.Sprintf(`filter: { name: { eq: "%s" } }`, lastName), []map[string]any{ + geometryReadExpected(lastName, lastPoint, lastPoint.x, lastPoint.y), + }) +} + // TestIngest_Postgres_ArrowIPCFile_NotFound checks that a missing file // surfaces a clean error without touching the server. func TestIngest_Postgres_ArrowIPCFile_NotFound(t *testing.T) { @@ -1552,24 +1885,20 @@ func TestIngest_Postgres_LazyReader(t *testing.T) { } rb := array.NewRecordBuilder(pool, schema) defer rb.Release() - names := rb.Field(0).(*array.StringBuilder) - values := rb.Field(1).(*array.Float64Builder) - active := rb.Field(2).(*array.BooleanBuilder) - payloads := rb.Field(3).(*array.StringBuilder) - ts := rb.Field(4).(*array.TimestampBuilder) + fields := eventsRecordBuildersFor(rb) for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - names.Append(fmt.Sprintf("lz-%06d", row)) - values.Append(float64(row) * 0.5) - active.Append(row%2 == 0) + fields.names.Append(fmt.Sprintf("lz-%06d", row)) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) if row%5 == 0 { - payloads.AppendNull() + fields.payloads.AppendNull() } else { - payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) } - ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) } - rec := rb.NewRecord() + rec := rb.NewRecordBatch() batchIdx++ return rec, nil }) @@ -1600,8 +1929,8 @@ func TestIngest_LazyReader_Termination(t *testing.T) { mk := func(v int32) arrow.RecordBatch { b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.Int32Builder).Append(v) - return b.NewRecord() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).Append(v) + return b.NewRecordBatch() } // Case 1: gen returns batches then nil — clean end-of-stream. @@ -1656,9 +1985,9 @@ func TestIngest_Postgres_UnknownColumn(t *testing.T) { }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.StringBuilder).AppendValues([]string{"x"}, nil) - b.Field(1).(*array.Int32Builder).AppendValues([]int32{1}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"x"}, nil) + recordFieldBuilder(t, b, "not_a_column").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() defer rec.Release() _, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) @@ -1679,8 +2008,8 @@ func TestIngest_Postgres_UnknownDataObject(t *testing.T) { }, nil) b := array.NewRecordBuilder(pool, schema) defer b.Release() - b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) - rec := b.NewRecord() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() defer rec.Release() _, err := env.client.IngestRecord(context.Background(), "pg_ingest.does_not_exist", rec) @@ -1730,10 +2059,10 @@ func TestIngest_HTTP_Direct(t *testing.T) { {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, }, nil) bld := array.NewRecordBuilder(pool, schema) - bld.Field(0).(*array.StringBuilder).AppendValues([]string{"direct"}, nil) - bld.Field(1).(*array.Float64Builder).AppendValues([]float64{42}, nil) - bld.Field(2).(*array.BooleanBuilder).AppendValues([]bool{true}, nil) - rec := bld.NewRecord() + recordFieldBuilder(t, bld, "name").(*array.StringBuilder).AppendValues([]string{"direct"}, nil) + recordFieldBuilder(t, bld, "value").(*array.Float64Builder).AppendValues([]float64{42}, nil) + recordFieldBuilder(t, bld, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true}, nil) + rec := bld.NewRecordBatch() bld.Release() defer rec.Release() @@ -1783,24 +2112,20 @@ func TestIngest_HTTP_Direct(t *testing.T) { var streamErr error for batchIdx := 0; batchIdx < numBatches; batchIdx++ { rb := array.NewRecordBuilder(pool, bulkSchema) - names := rb.Field(0).(*array.StringBuilder) - values := rb.Field(1).(*array.Float64Builder) - active := rb.Field(2).(*array.BooleanBuilder) - payloads := rb.Field(3).(*array.StringBuilder) - ts := rb.Field(4).(*array.TimestampBuilder) + fields := eventsRecordBuildersFor(rb) for i := 0; i < rowsPerBatch; i++ { row := batchIdx*rowsPerBatch + i - names.Append(fmt.Sprintf("evt-%06d", row)) - values.Append(float64(row) * 0.5) - active.Append(row%2 == 0) + fields.names.Append(fmt.Sprintf("evt-%06d", row)) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) if row%5 == 0 { - payloads.AppendNull() + fields.payloads.AppendNull() } else { - payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) } - ts.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) } - batchRec := rb.NewRecord() + batchRec := rb.NewRecordBatch() rb.Release() if werr := w.Write(batchRec); werr != nil { streamErr = fmt.Errorf("write batch %d: %w", batchIdx, werr) @@ -1880,7 +2205,10 @@ func TestIngest_HTTP_Direct(t *testing.T) { func TestIngest_HTTP_GeometryTypes(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeometryTypesRecord(t, []string{"geo-a", "geo-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + {name: "geo-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) defer rec.Release() var buf bytes.Buffer @@ -1900,42 +2228,20 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { assert.Equal(t, int64(2), out.Inserted) assert.ElementsMatch(t, geometryTypesColumns(), out.Columns) - rows, err := env.pgConn.Query(` + rows, err := env.pgConn.Query(fmt.Sprintf(` SELECT name, - ST_AsText(geom), ST_SRID(geom), - ST_AsText(geom_4326), ST_SRID(geom_4326), - ST_AsText(geom_wkt), ST_SRID(geom_wkt), - ST_AsText(geom_wkt_4326), ST_SRID(geom_wkt_4326), - ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), - ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb), - ST_AsText(geom_line), ST_SRID(geom_line), - ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), - ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), - ST_AsText(geom_multiline), ST_SRID(geom_multiline), - ST_AsText(geom_multipolygon), ST_SRID(geom_multipolygon) + %s FROM events - WHERE name LIKE 'geo-%' + WHERE name LIKE 'geo-%%' ORDER BY name - `) + `, geometrySelectList(true))) require.NoError(t, err) defer rows.Close() got := map[string][]string{} gotSRID := map[string][]int{} for rows.Next() { - var name string - values := make([]string, 13) - srids := make([]int, 13) - scanArgs := []any{&name} - for i := range values { - scanArgs = append(scanArgs, &values[i], &srids[i]) - } - require.NoError(t, rows.Scan(scanArgs...)) - for i := range values { - values[i] = compactWKT(values[i]) - } + name, values, srids := scanNamedGeometryValuesWithSRID(t, rows) got[name] = values gotSRID[name] = srids } @@ -1953,7 +2259,10 @@ func TestIngest_HTTP_GeometryTypes(t *testing.T) { func TestIngest_HTTP_GeometryTypes_ReadThroughHugr(t *testing.T) { env := setupEnv(t) - rec, schema := makeGeometryTypesRecord(t, []string{"geo-read-a", "geo-read-b"}, [][2]float64{{30.5, 50.25}, {-73.935242, 40.730610}}) + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-read-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + {name: "geo-read-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) defer rec.Release() var buf bytes.Buffer @@ -1969,8 +2278,8 @@ func TestIngest_HTTP_GeometryTypes_ReadThroughHugr(t *testing.T) { require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{ - geometryReadExpected("geo-read-a", [2]float64{30.5, 50.25}, 0, 0), - geometryReadExpected("geo-read-b", [2]float64{-73.935242, 40.730610}, 1, 1), + geometryReadExpected("geo-read-a", xyPoint{x: 30.5, y: 50.25}, 0, 0), + geometryReadExpected("geo-read-b", xyPoint{x: -73.935242, y: 40.730610}, 1, 1), }) } @@ -1993,7 +2302,7 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { w := ipc.NewWriter(pw, ipc.WithSchema(schema)) var streamErr error for batchIdx := 0; batchIdx < numBatches; batchIdx++ { - rec := buildGeometryTypesBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix) + rec := buildGeometryTypesBatch(t, pool, schema, batchIdx, rowsPerBatch, namePrefix) if err := w.Write(rec); err != nil { streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err) rec.Release() @@ -2026,38 +2335,15 @@ func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) { require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count)) assert.Equal(t, totalRows, count) - values := make([]string, 13) - srids := make([]int, 13) - require.NoError(t, env.pgConn.QueryRow(` - SELECT ST_AsText(geom), ST_SRID(geom), - ST_AsText(geom_4326), ST_SRID(geom_4326), - ST_AsText(geom_wkt), ST_SRID(geom_wkt), - ST_AsText(geom_wkt_4326), ST_SRID(geom_wkt_4326), - ST_AsText(geom_geojson), ST_SRID(geom_geojson), - ST_AsText(geom_hugr_geojson), ST_SRID(geom_hugr_geojson), - ST_AsText(geom_plain_geojson), ST_SRID(geom_plain_geojson), - ST_AsText(geom_wkb), ST_SRID(geom_wkb), - ST_AsText(geom_line), ST_SRID(geom_line), - ST_AsText(geom_polygon_native), ST_SRID(geom_polygon_native), - ST_AsText(geom_multipoint), ST_SRID(geom_multipoint), - ST_AsText(geom_multiline), ST_SRID(geom_multiline), - ST_AsText(geom_multipolygon), ST_SRID(geom_multipolygon) + values, srids := scanGeometryValuesWithSRID(t, env.pgConn.QueryRow(fmt.Sprintf(` + SELECT %s FROM events WHERE name = 'pg-geo-bulk-049999' - `).Scan( - &values[0], &srids[0], &values[1], &srids[1], &values[2], &srids[2], - &values[3], &srids[3], &values[4], &srids[4], &values[5], &srids[5], - &values[6], &srids[6], &values[7], &srids[7], &values[8], &srids[8], - &values[9], &srids[9], &values[10], &srids[10], &values[11], &srids[11], - &values[12], &srids[12], - )) - for i := range values { - values[i] = compactWKT(values[i]) - } + `, geometrySelectList(true)))) assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) assert.Equal(t, geometrySRIDExpected(), srids) assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "pg-geo-bulk-049999" } }`, []map[string]any{ - geometryReadExpected("pg-geo-bulk-049999", [2]float64{99, 49}, 99, 49), + geometryReadExpected("pg-geo-bulk-049999", xyPoint{x: 99, y: 49}, 99, 49), }) elapsed := time.Since(start) @@ -2114,24 +2400,20 @@ func (r *lazyEventsReader) Next() bool { } rb := array.NewRecordBuilder(r.pool, r.schema) defer rb.Release() - names := rb.Field(0).(*array.StringBuilder) - values := rb.Field(1).(*array.Float64Builder) - active := rb.Field(2).(*array.BooleanBuilder) - payloads := rb.Field(3).(*array.StringBuilder) - ts := rb.Field(4).(*array.TimestampBuilder) + fields := eventsRecordBuildersFor(rb) for i := 0; i < r.rowsPerBatch; i++ { row := r.batchIdx*r.rowsPerBatch + i - names.Append(fmt.Sprintf("evt-%06d", row)) - values.Append(float64(row) * 0.5) - active.Append(row%2 == 0) + fields.names.Append(fmt.Sprintf("evt-%06d", row)) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) if row%5 == 0 { - payloads.AppendNull() + fields.payloads.AppendNull() } else { - payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) } - ts.Append(arrow.Timestamp(r.base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + fields.createdAt.Append(arrow.Timestamp(r.base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) } - r.current = rb.NewRecord() + r.current = rb.NewRecordBatch() r.batchIdx++ return true } diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql index dfd884a9..661f25fc 100644 --- a/integration-test/ingest-postgres/testdata/init.sql +++ b/integration-test/ingest-postgres/testdata/init.sql @@ -26,6 +26,7 @@ CREATE TABLE events ( payload_map JSONB, payload_scalar JSONB, payload_arrow_json JSONB, + payload_geo_point JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), geom GEOMETRY(Point, 0), geom_4326 GEOMETRY(Point, 4326), @@ -34,7 +35,10 @@ CREATE TABLE events ( geom_geojson GEOMETRY(Polygon, 0), geom_hugr_geojson GEOMETRY(Polygon, 0), geom_plain_geojson GEOMETRY(Polygon, 0), + geom_geojson_struct GEOMETRY(Polygon, 0), + geom_geojson_arrow_json GEOMETRY(Polygon, 0), geom_wkb GEOMETRY(Point, 0), + geom_hexwkb GEOMETRY(Point, 0), geom_line GEOMETRY(LineString, 0), geom_polygon_native GEOMETRY(Polygon, 0), geom_multipoint GEOMETRY(MultiPoint, 0), diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql index 3110c874..43096fa6 100644 --- a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -19,6 +19,7 @@ type events @table(name: "events") { payload_map: JSON payload_scalar: JSON payload_arrow_json: JSON + payload_geo_point: JSON created_at: Timestamp @default(value: "now()") geom: Geometry @geometry_info(srid: 0, type: POINT) geom_4326: Geometry @geometry_info(srid: 4326, type: POINT) @@ -27,7 +28,10 @@ type events @table(name: "events") { geom_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) geom_hugr_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) geom_plain_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_geojson_struct: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_geojson_arrow_json: Geometry @geometry_info(srid: 0, type: POLYGON) geom_wkb: Geometry @geometry_info(srid: 0, type: POINT) + geom_hexwkb: Geometry @geometry_info(srid: 0, type: POINT) geom_line: Geometry @geometry_info(srid: 0, type: LINESTRING) geom_polygon_native: Geometry @geometry_info(srid: 0, type: POLYGON) geom_multipoint: Geometry @geometry_info(srid: 0, type: MULTIPOINT) diff --git a/pkg/db/arrow_ingest_source.go b/pkg/db/arrow_ingest_source.go index ea3b6320..f0cf1473 100644 --- a/pkg/db/arrow_ingest_source.go +++ b/pkg/db/arrow_ingest_source.go @@ -64,5 +64,9 @@ func (s ArrowIngestSource) RegisterView(arrowConn interface { func isGeometryArrowExtension(ext string) bool { ext = strings.ToLower(ext) - return strings.HasPrefix(ext, "geoarrow.") || ext == "hugr.geojson" || ext == "geojson" + return strings.HasPrefix(ext, "geoarrow.") || + ext == "hugr.geojson" || + ext == "geojson" || + ext == "hugr.hexwkb" || + ext == "hexwkb" } diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go index 135825af..083cbc87 100644 --- a/pkg/engines/arrow_ingest.go +++ b/pkg/engines/arrow_ingest.go @@ -38,7 +38,7 @@ func (b *ArrowIngestStagingBuilder) SelectExpr(field *ast.Field, arrowField arro } switch field.Definition.Type.Name() { case base.JSONTypeName: - return arrowIngestJSONStagingExpr(arrowField, sourceExpr), nil + return arrowIngestJSONStagingExpr(arrowField, sourceExpr) case base.GeometryTypeName: return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) default: @@ -69,66 +69,195 @@ func (b *ArrowIngestStagingBuilder) LiteralExpr(field *ast.Field, value any) (st return b.duckdb.SQLValue(value) } -func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) string { - switch arrowField.Type.ID() { - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CAST(" + sourceExpr + " AS JSON)" - case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: - return "CAST(decode(" + sourceExpr + ") AS JSON)" - case arrow.STRUCT, arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, - arrow.LIST_VIEW, arrow.LARGE_LIST_VIEW, arrow.MAP: - return "to_json(" + sourceExpr + ")" +const ( + arrowJSONExtension = "arrow.json" + + hugrGeoJSONExtension = "hugr.geojson" + geoArrowGeoJSONExtension = "geoarrow.geojson" + plainGeoJSONExtension = "geojson" + hugrHexWKBExtension = "hugr.hexwkb" + geoArrowHexWKBExtension = "geoarrow.hexwkb" + plainHexWKBExtension = "hexwkb" + + geoArrowWKBExtension = "geoarrow.wkb" + geoArrowWKTExtension = "geoarrow.wkt" + geoArrowPointExtension = "geoarrow.point" + geoArrowLineStringExtension = "geoarrow.linestring" + geoArrowPolygonExtension = "geoarrow.polygon" + geoArrowMultiPointExtension = "geoarrow.multipoint" + geoArrowMultiLineStringExtension = "geoarrow.multilinestring" + geoArrowMultiPolygonExtension = "geoarrow.multipolygon" + geoArrowGeometryExtension = "geoarrow.geometry" + geoArrowGeometryCollectionExtension = "geoarrow.geometrycollection" +) + +func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + ext := arrowExtensionNameFromTypeOrMetadata(arrowField) + switch { + case ext == "": + return jsonExprFromPlainArrow(arrowField, sourceExpr), nil + case ext == arrowJSONExtension: + return jsonExprFromArrowJSONExtension(arrowField, sourceExpr) + case isGeoJSONExtension(ext): + return jsonExprFromGeoJSONExtension(arrowField, sourceExpr) + case needsGeometryToJSON(ext): + geomExpr, err := geometryExprFromExtension(ext, arrowField, sourceExpr) + if err != nil { + return "", err + } + return jsonExprFromGeometryExpr(geomExpr), nil + default: + return "", fmt.Errorf("unsupported Arrow extension %q for JSON ingest", ext) + } +} + +func jsonExprFromPlainArrow(arrowField arrow.Field, sourceExpr string) string { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr + } + return duckDBToJSON(sourceExpr) +} + +func jsonExprFromArrowJSONExtension(arrowField arrow.Field, sourceExpr string) (string, error) { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr, nil + } + return "", storageError(arrowField, arrowJSONExtension) +} + +func jsonExprFromGeoJSONExtension(arrowField arrow.Field, sourceExpr string) (string, error) { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr, nil + } + if isArrowObjectStorage(arrowStorageTypeID(arrowField.Type)) { + return duckDBToJSON(sourceExpr), nil + } + return "", storageError(arrowField, "GeoJSON") +} + +func jsonExprFromSerializedStorage(arrowField arrow.Field, sourceExpr string) (string, bool) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowStringStorage(storage): + return "CAST(" + sourceExpr + " AS JSON)", true + case isArrowBinaryStorage(storage): + return "CAST(decode(" + sourceExpr + ") AS JSON)", true default: - return "to_json(" + sourceExpr + ")" + return "", false } } +func jsonExprFromGeometryExpr(geometryExpr string) string { + return "CAST(ST_AsGeoJSON(" + geometryExpr + ") AS JSON)" +} + +func duckDBToJSON(sql string) string { + return "to_json(" + sql + ")" +} + func arrowIngestGeometryStagingExpr(arrowField arrow.Field, sourceExpr string) (string, error) { - if ext := arrowExtensionName(arrowField); ext != "" { - return arrowIngestGeometryStagingExprFromTrustedExtension(ext, sourceExpr) + ext := arrowExtensionNameFromTypeOrMetadata(arrowField) + if ext == "" { + return geometryExprFromPlainArrow(arrowField, sourceExpr) } - return arrowIngestGeometryStagingExprFromPhysicalType(arrowField, sourceExpr) + return geometryExprFromExtension(ext, arrowField, sourceExpr) } -// arrowIngestGeometryStagingExprFromTrustedExtension uses GeoArrow/Hugr extension -// metadata as the source of truth for geometry semantics. The physical Arrow -// storage type is intentionally not used as a fallback once extension metadata -// is present; unsupported metadata should fail during planning instead of being -// guessed from Type.ID(). -func arrowIngestGeometryStagingExprFromTrustedExtension(ext, sourceExpr string) (string, error) { - switch ext { - case "geoarrow.wkb": - return sourceExpr, nil - case "geoarrow.wkt": - return "ST_GeomFromText(" + sourceExpr + ", true)", nil - case "hugr.geojson", "geoarrow.geojson", "geojson": - return "ST_GeomFromGeoJSON(" + sourceExpr + ")", nil - case "geoarrow.linestring", "geoarrow.polygon", - "geoarrow.multipoint", "geoarrow.multilinestring", "geoarrow.multipolygon", - "geoarrow.point", "geoarrow.geometry", "geoarrow.geometrycollection": - return arrowIngestGeoArrowNativeGeometryStagingExpr(ext, sourceExpr) +// geometryExprFromExtension uses GeoArrow/Hugr extension metadata as the source +// of truth. The physical Arrow storage type is only validated inside the +// selected extension handler; unsupported metadata never falls back to guessing. +func geometryExprFromExtension(ext string, arrowField arrow.Field, sourceExpr string) (string, error) { + switch { + case ext == geoArrowWKBExtension: + return geometryExprFromGeoArrowWKB(arrowField, sourceExpr) + case isHexWKBExtension(ext): + return geometryExprFromHexWKB(arrowField, sourceExpr) + case ext == geoArrowWKTExtension: + return geometryExprFromWKT(arrowField, sourceExpr) + case isGeoJSONExtension(ext): + return geometryExprFromGeoJSON(arrowField, sourceExpr) + case ext == arrowJSONExtension: + return geometryExprFromArrowJSON(arrowField, sourceExpr) + case isGeoArrowCoordinateExtension(ext): + return geometryExprFromGeoArrowCoordinates(ext, sourceExpr) default: return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) } } -// arrowIngestGeometryStagingExprFromPhysicalType is the best-effort path for -// unannotated Arrow columns. Without extension metadata we infer common -// geometry encodings from physical Arrow storage. -func arrowIngestGeometryStagingExprFromPhysicalType(arrowField arrow.Field, sourceExpr string) (string, error) { - switch arrowField.Type.ID() { - case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW, arrow.FIXED_SIZE_BINARY: +func geometryExprFromPlainArrow(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowBinaryStorage(storage) || storage == arrow.FIXED_SIZE_BINARY: return "ST_GeomFromWKB(" + sourceExpr + ")", nil - case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: - return "CASE WHEN starts_with(trim(" + sourceExpr + "), '{') THEN ST_GeomFromGeoJSON(" + sourceExpr + ") ELSE ST_GeomFromText(" + sourceExpr + ", true) END", nil - case arrow.STRUCT, arrow.MAP: - return "ST_GeomFromGeoJSON(to_json(" + sourceExpr + ")::VARCHAR)", nil + case isArrowStringStorage(storage): + return "ST_GeomFromText(" + sourceExpr + ", true)", nil + case isArrowObjectStorage(storage): + return "ST_GeomFromGeoJSON(" + duckDBJSONAsVarchar(sourceExpr) + ")", nil default: return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) } } -func arrowExtensionName(field arrow.Field) string { +func geometryExprFromGeoArrowWKB(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowBinaryStorage(storage) || storage == arrow.FIXED_SIZE_BINARY { + return sourceExpr, nil + } + return "", storageError(arrowField, geoArrowWKBExtension) +} + +func geometryExprFromHexWKB(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromWKB(from_hex(" + sourceExpr + "))", nil + } + return "", storageError(arrowField, "hexwkb") +} + +func geometryExprFromWKT(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromText(" + sourceExpr + ", true)", nil + } + return "", storageError(arrowField, geoArrowWKTExtension) +} + +func geometryExprFromGeoJSON(arrowField arrow.Field, sourceExpr string) (string, error) { + textExpr, err := geoJSONTextExpr(arrowField, sourceExpr) + if err != nil { + return "", err + } + return "ST_GeomFromGeoJSON(" + textExpr + ")", nil +} + +func geometryExprFromArrowJSON(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromGeoJSON(CAST(" + sourceExpr + " AS VARCHAR))", nil + } + return "", storageError(arrowField, arrowJSONExtension) +} + +func geoJSONTextExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowStringStorage(storage): + return sourceExpr, nil + case isArrowBinaryStorage(storage): + return "CAST(decode(" + sourceExpr + ") AS VARCHAR)", nil + case isArrowObjectStorage(storage): + return duckDBJSONAsVarchar(sourceExpr), nil + default: + return "", storageError(arrowField, "GeoJSON") + } +} + +func duckDBJSONAsVarchar(sql string) string { + return duckDBToJSON(sql) + "::VARCHAR" +} + +func arrowExtensionNameFromTypeOrMetadata(field arrow.Field) string { if extType, ok := field.Type.(arrow.ExtensionType); ok { return strings.ToLower(extType.ExtensionName()) } @@ -141,47 +270,126 @@ func arrowExtensionName(field arrow.Field) string { return "" } -func arrowIngestGeoArrowPointGeometryStagingExpr(sql string) string { +func arrowStorageTypeID(dt arrow.DataType) arrow.Type { + if extType, ok := dt.(arrow.ExtensionType); ok { + return extType.StorageType().ID() + } + return dt.ID() +} + +func isArrowStringStorage(storage arrow.Type) bool { + switch storage { + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return true + default: + return false + } +} + +func isArrowBinaryStorage(storage arrow.Type) bool { + switch storage { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + return true + default: + return false + } +} + +func isArrowObjectStorage(storage arrow.Type) bool { + switch storage { + case arrow.STRUCT, arrow.MAP: + return true + default: + return false + } +} + +func isGeoJSONExtension(ext string) bool { + switch ext { + case hugrGeoJSONExtension, geoArrowGeoJSONExtension, plainGeoJSONExtension: + return true + default: + return false + } +} + +func isHexWKBExtension(ext string) bool { + switch ext { + case hugrHexWKBExtension, geoArrowHexWKBExtension, plainHexWKBExtension: + return true + default: + return false + } +} + +func needsGeometryToJSON(ext string) bool { + return ext == geoArrowWKBExtension || + ext == geoArrowWKTExtension || + isHexWKBExtension(ext) || + isGeoArrowCoordinateExtension(ext) +} + +func isGeoArrowCoordinateExtension(ext string) bool { + switch ext { + case geoArrowPointExtension, + geoArrowLineStringExtension, + geoArrowPolygonExtension, + geoArrowMultiPointExtension, + geoArrowMultiLineStringExtension, + geoArrowMultiPolygonExtension, + geoArrowGeometryExtension, + geoArrowGeometryCollectionExtension: + return true + default: + return false + } +} + +func storageError(arrowField arrow.Field, format string) error { + return fmt.Errorf("arrow column %q with type %s cannot use %s storage", arrowField.Name, arrowField.Type, format) +} + +func geoArrowPointGeometryExpr(sql string) string { return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" } -func arrowIngestGeoArrowLineStringGeometryStagingExpr(sql string) string { - return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + arrowIngestGeoArrowPointGeometryStagingExpr("_p") + "))" +func geoArrowLineStringGeometryExpr(sql string) string { + return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + geoArrowPointGeometryExpr("_p") + "))" } -func arrowIngestGeoArrowPolygonGeometryStagingExpr(sql string) string { - shell := arrowIngestGeoArrowLineStringGeometryStagingExpr(sql + "[1]") - holes := "list_transform(" + sql + "[2:], lambda _r: " + arrowIngestGeoArrowLineStringGeometryStagingExpr("_r") + ")" +func geoArrowPolygonGeometryExpr(sql string) string { + shell := geoArrowLineStringGeometryExpr(sql + "[1]") + holes := "list_transform(" + sql + "[2:], lambda _r: " + geoArrowLineStringGeometryExpr("_r") + ")" return "ST_MakePolygon(" + shell + ", " + holes + ")" } -func arrowIngestGeoArrowMultiPointGeometryStagingExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + arrowIngestGeoArrowPointGeometryStagingExpr("_p") + ")))" +func geoArrowMultiPointGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + geoArrowPointGeometryExpr("_p") + ")))" } -func arrowIngestGeoArrowMultiLineStringGeometryStagingExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + arrowIngestGeoArrowLineStringGeometryStagingExpr("_ls") + ")))" +func geoArrowMultiLineStringGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + geoArrowLineStringGeometryExpr("_ls") + ")))" } -func arrowIngestGeoArrowMultiPolygonGeometryStagingExpr(sql string) string { - return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + arrowIngestGeoArrowPolygonGeometryStagingExpr("_poly") + ")))" +func geoArrowMultiPolygonGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + geoArrowPolygonGeometryExpr("_poly") + ")))" } -func arrowIngestGeoArrowNativeGeometryStagingExpr(ext, sql string) (string, error) { +func geometryExprFromGeoArrowCoordinates(ext, sql string) (string, error) { switch ext { - case "geoarrow.point": - return arrowIngestGeoArrowPointGeometryStagingExpr(sql), nil - case "geoarrow.linestring": - return arrowIngestGeoArrowLineStringGeometryStagingExpr(sql), nil - case "geoarrow.polygon": - return arrowIngestGeoArrowPolygonGeometryStagingExpr(sql), nil - case "geoarrow.multipoint": - return arrowIngestGeoArrowMultiPointGeometryStagingExpr(sql), nil - case "geoarrow.multilinestring": - return arrowIngestGeoArrowMultiLineStringGeometryStagingExpr(sql), nil - case "geoarrow.multipolygon": - return arrowIngestGeoArrowMultiPolygonGeometryStagingExpr(sql), nil - case "geoarrow.geometry", "geoarrow.geometrycollection": + case geoArrowPointExtension: + return geoArrowPointGeometryExpr(sql), nil + case geoArrowLineStringExtension: + return geoArrowLineStringGeometryExpr(sql), nil + case geoArrowPolygonExtension: + return geoArrowPolygonGeometryExpr(sql), nil + case geoArrowMultiPointExtension: + return geoArrowMultiPointGeometryExpr(sql), nil + case geoArrowMultiLineStringExtension: + return geoArrowMultiLineStringGeometryExpr(sql), nil + case geoArrowMultiPolygonExtension: + return geoArrowMultiPolygonGeometryExpr(sql), nil + case geoArrowGeometryExtension, geoArrowGeometryCollectionExtension: return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) default: return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) diff --git a/pkg/engines/arrow_ingest_test.go b/pkg/engines/arrow_ingest_test.go index 2e8df3ec..1e3160ef 100644 --- a/pkg/engines/arrow_ingest_test.go +++ b/pkg/engines/arrow_ingest_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/extensions" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/paulmach/orb" "github.com/vektah/gqlparser/v2/ast" @@ -14,6 +15,7 @@ func TestArrowIngestJSONStagingExpr(t *testing.T) { tests := []struct { name string typ arrow.DataType + ext string want string }{ {name: "string", typ: arrow.BinaryTypes.String, want: "CAST(payload AS JSON)"}, @@ -30,11 +32,24 @@ func TestArrowIngestJSONStagingExpr(t *testing.T) { {name: "large list view", typ: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, {name: "map", typ: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, {name: "scalar", typ: arrow.PrimitiveTypes.Int64, want: "to_json(payload)"}, + {name: "arrow json extension", typ: mustTestArrowJSONType(t), want: "CAST(payload AS JSON)"}, + {name: "geojson string extension", typ: arrow.BinaryTypes.String, ext: "geoarrow.geojson", want: "CAST(payload AS JSON)"}, + {name: "geojson struct extension", typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), ext: "geoarrow.geojson", want: "to_json(payload)"}, + {name: "geo wkt extension", typ: arrow.BinaryTypes.String, ext: "geoarrow.wkt", want: "CAST(ST_AsGeoJSON(ST_GeomFromText(payload, true)) AS JSON)"}, + {name: "geo hex wkb extension", typ: arrow.BinaryTypes.String, ext: "hugr.hexwkb", want: "CAST(ST_AsGeoJSON(ST_GeomFromWKB(from_hex(payload))) AS JSON)"}, + {name: "native geoarrow point extension", typ: geoArrowTestType("geoarrow.point"), ext: "geoarrow.point", want: "CAST(ST_AsGeoJSON(ST_Point(struct_extract(payload, 'x'), struct_extract(payload, 'y'))) AS JSON)"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := arrowIngestJSONStagingExpr(arrow.Field{Name: "payload", Type: tt.typ}, "payload") + meta := arrow.Metadata{} + if tt.ext != "" { + meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) + } + got, err := arrowIngestJSONStagingExpr(arrow.Field{Name: "payload", Type: tt.typ, Metadata: meta}, "payload") + if err != nil { + t.Fatal(err) + } if got != tt.want { t.Fatalf("got %q, want %q", got, tt.want) } @@ -42,6 +57,20 @@ func TestArrowIngestJSONStagingExpr(t *testing.T) { } } +func TestArrowIngestJSONRejectsUnsupportedExtensionMetadata(t *testing.T) { + _, err := arrowIngestJSONStagingExpr(arrow.Field{ + Name: "payload", + Type: arrow.BinaryTypes.String, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.unknown_json"}), + }, "payload") + if err == nil { + t.Fatal("expected unsupported JSON extension to be rejected") + } + if !strings.Contains(err.Error(), `unsupported Arrow extension "hugr.unknown_json" for JSON ingest`) { + t.Fatalf("unexpected error: %v", err) + } +} + func TestArrowIngestStagingBuildsNativeGeoArrowSelectExpr(t *testing.T) { field := geometryTestField("") staging := NewArrowIngestStagingBuilder() @@ -120,15 +149,38 @@ func TestArrowIngestStagingBuildsDirectGeometrySelectExpr(t *testing.T) { ext: "geojson", want: "ST_GeomFromGeoJSON(geom)", }, + { + name: "trusted geojson struct serializes to json text", + typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), + ext: "geoarrow.geojson", + want: "ST_GeomFromGeoJSON(to_json(geom)::VARCHAR)", + }, + { + name: "unannotated struct serializes to geojson text", + typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), + want: "ST_GeomFromGeoJSON(to_json(geom)::VARCHAR)", + }, + { + name: "arrow json parses as geojson text", + typ: mustTestArrowJSONType(t), + ext: "arrow.json", + want: "ST_GeomFromGeoJSON(CAST(geom AS VARCHAR))", + }, + { + name: "trusted hex wkb parses through from_hex", + typ: arrow.BinaryTypes.String, + ext: "hugr.hexwkb", + want: "ST_GeomFromWKB(from_hex(geom))", + }, { name: "unannotated binary parses directly as wkb", typ: arrow.BinaryTypes.Binary, want: "ST_GeomFromWKB(geom)", }, { - name: "unannotated string chooses geojson or wkt without text roundtrip", + name: "unannotated string parses as wkt", typ: arrow.BinaryTypes.String, - want: "CASE WHEN starts_with(trim(geom), '{') THEN ST_GeomFromGeoJSON(geom) ELSE ST_GeomFromText(geom, true) END", + want: "ST_GeomFromText(geom, true)", }, } @@ -156,6 +208,15 @@ func TestArrowIngestStagingBuildsDirectGeometrySelectExpr(t *testing.T) { } } +func mustTestArrowJSONType(t *testing.T) arrow.DataType { + t.Helper() + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + t.Fatal(err) + } + return typ +} + func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { field := geometryTestField("") staging := NewArrowIngestStagingBuilder()