diff --git a/.gitignore b/.gitignore index 50268021..027a93ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS* .local .claude/ +.idea/ .specify/ specs/ design/ diff --git a/client/ingest.go b/client/ingest.go new file mode 100644 index 00000000..c836bdca --- /dev/null +++ b/client/ingest.go @@ -0,0 +1,346 @@ +package client + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "sync/atomic" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" +) + +// IngestResult is the success payload returned by /ipc/ingest. +type IngestResult struct { + DataObject string `json:"data_object"` + Inserted int64 `json:"inserted"` + Columns []string `json:"columns"` +} + +const ingestContentType = "application/vnd.apache.arrow.stream" + +// arrowFileMagic identifies the Arrow IPC *file* format (random-access), +// distinct from the IPC *stream* format that /ipc/ingest expects on the wire. +var arrowFileMagic = []byte("ARROW1") + +// IngestStream POSTs the given Arrow IPC stream to /ipc/ingest. The body +// must already be a valid Arrow IPC stream (schema message followed by +// record batches) — typically produced by ipc.NewWriter, by another tool, +// or read from a stream-format file. +// +// The body is forwarded to the server without intermediate buffering. Use +// this when the caller already has a serialised stream from disk, the +// network, or another process. Use Ingest for the higher-level API that +// serialises an array.RecordReader for you. 
+func (c *Client) IngestStream(ctx context.Context, dataObject string, body io.Reader) (*IngestResult, error) { + if dataObject == "" { + return nil, errors.New("hugr ingest: data_object is required") + } + if body == nil { + return nil, errors.New("hugr ingest: body is nil") + } + endpoint, err := buildIngestURL(c.url, dataObject) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, body) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", ingestContentType) + setAsUserHeaders(ctx, req) + resp, err := c.c.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + raw, _ := io.ReadAll(resp.Body) + var ebody struct { + Error string `json:"error"` + } + _ = json.Unmarshal(raw, &ebody) + if ebody.Error == "" { + ebody.Error = strings.TrimSpace(string(raw)) + } + if ebody.Error == "" { + ebody.Error = resp.Status + } + return nil, fmt.Errorf("hugr ingest: %s: %s", resp.Status, ebody.Error) + } + + var out IngestResult + if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { + return nil, fmt.Errorf("decode ingest response: %w", err) + } + return &out, nil +} + +// Ingest streams the records produced by reader into the target data object. +// Columns from the Arrow schema must match insertable fields of the table +// (computed/virtual/reference fields are rejected by the server). +// +// dataObject is either a dotted GraphQL Query path (e.g. "pg_store.public.events") +// or a bare hugr type name. The client serialises the reader as an Apache +// Arrow IPC stream and POSTs it to /ipc/ingest on the configured base URL. +// +// The reader is fully drained on success; on error the caller may inspect +// the reader's remaining state but it should be released by the caller in +// all cases. 
+func (c *Client) Ingest(ctx context.Context, dataObject string, reader array.RecordReader) (*IngestResult, error) { + if reader == nil { + return nil, errors.New("hugr ingest: reader is nil") + } + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + iw := ipc.NewWriter(pw, ipc.WithSchema(reader.Schema())) + var streamErr error + for reader.Next() { + rec := reader.RecordBatch() + if rec == nil { + continue + } + if err := iw.Write(rec); err != nil { + streamErr = fmt.Errorf("write arrow record: %w", err) + break + } + } + if streamErr == nil { + if err := reader.Err(); err != nil { + streamErr = fmt.Errorf("read arrow record: %w", err) + } + } + if err := iw.Close(); err != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", err) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + res, httpErr := c.IngestStream(ctx, dataObject, pr) + if httpErr != nil { + // Unblock the writer goroutine if the HTTP side aborted early. + _ = pr.CloseWithError(httpErr) + } + if werr := <-writeErr; werr != nil { + // Serialisation errors are more informative than the (likely + // derivative) HTTP error. + return nil, werr + } + return res, httpErr +} + +// IngestRecord is a single-batch convenience wrapper around Ingest. It builds +// an array.RecordReader from a single arrow.RecordBatch and forwards. +func (c *Client) IngestRecord(ctx context.Context, dataObject string, rec arrow.RecordBatch) (*IngestResult, error) { + if rec == nil { + return nil, errors.New("hugr ingest: record is nil") + } + rr, err := array.NewRecordReader(rec.Schema(), []arrow.RecordBatch{rec}) + if err != nil { + return nil, fmt.Errorf("build record reader: %w", err) + } + defer rr.Release() + return c.Ingest(ctx, dataObject, rr) +} + +// IngestArrowIPCFile opens an Arrow IPC file at path and streams its +// contents to /ipc/ingest. 
Both IPC formats are accepted: +// +// - stream format (no ARROW1 prefix) — written by ipc.NewWriter or +// pyarrow.ipc.new_stream. Bytes are forwarded directly to the server, +// zero-copy. +// - file format (.arrow / .feather, starts with ARROW1 magic) — written +// by ipc.NewFileWriter or pyarrow.feather.write_feather. The file is +// read sequentially via ipc.FileReader and re-emitted as a stream. +func (c *Client) IngestArrowIPCFile(ctx context.Context, dataObject, path string) (*IngestResult, error) { + if path == "" { + return nil, errors.New("hugr ingest: path is required") + } + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open %s: %w", path, err) + } + defer f.Close() + + // Peek for the ARROW1 magic to decide between stream and file format. + var magic [6]byte + n, err := io.ReadFull(f, magic[:]) + if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) { + return nil, fmt.Errorf("read %s: %w", path, err) + } + if n == len(arrowFileMagic) && bytes.Equal(magic[:], arrowFileMagic) { + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, fmt.Errorf("seek %s: %w", path, err) + } + fr, err := ipc.NewFileReader(f, ipc.WithAllocator(memory.NewGoAllocator())) + if err != nil { + return nil, fmt.Errorf("open arrow ipc file %s: %w", path, err) + } + defer fr.Close() + rr := &fileReaderAsRecordReader{fr: fr} + rr.refCount.Add(1) + defer rr.Release() + return c.Ingest(ctx, dataObject, rr) + } + + // Stream format — forward bytes. Prepend the bytes we already consumed + // during magic detection. + body := io.MultiReader(bytes.NewReader(magic[:n]), f) + return c.IngestStream(ctx, dataObject, body) +} + +// NewLazyReader returns an array.RecordReader that produces batches by +// calling gen. gen should return (batch, nil) for each successive batch and +// (nil, nil) to signal end-of-stream. Returning (_, err) terminates the +// reader; the error is then visible via Err(). 
+// +// The reader takes ownership of each returned batch and releases it on the +// next Next() call or on the final Release. The caller must not Release +// the batch themselves after returning it from gen. +// +// Typical use: stream bulk data from any source (file, channel, generator) +// into Client.Ingest without implementing the full array.RecordReader +// interface by hand. +func NewLazyReader(schema *arrow.Schema, gen func() (arrow.RecordBatch, error)) array.RecordReader { + r := &lazyReader{schema: schema, gen: gen} + r.refCount.Add(1) + return r +} + +// buildIngestURL derives the /ipc/ingest endpoint from the client's base /ipc URL. +// Accepts both ".../ipc" (canonical) and ".../ipc/" forms. +func buildIngestURL(base, dataObject string) (string, error) { + u, err := url.Parse(base) + if err != nil { + return "", fmt.Errorf("invalid hugr url %q: %w", base, err) + } + path := strings.TrimSuffix(u.Path, "/") + switch { + case strings.HasSuffix(path, "/ipc"): + u.Path = path + "/ingest" + case strings.HasSuffix(path, "/ipc/ingest"): + // already pointed at ingest endpoint — keep as-is + default: + u.Path = path + "/ipc/ingest" + } + q := u.Query() + q.Set("data_object", dataObject) + u.RawQuery = q.Encode() + return u.String(), nil +} + +// --- lazyReader ----------------------------------------------------------- + +type lazyReader struct { + schema *arrow.Schema + gen func() (arrow.RecordBatch, error) + + cur arrow.RecordBatch + err error + done bool + refCount atomic.Int64 +} + +func (r *lazyReader) Schema() *arrow.Schema { return r.schema } +func (r *lazyReader) Err() error { return r.err } + +func (r *lazyReader) Next() bool { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + if r.done || r.err != nil { + return false + } + rec, err := r.gen() + if err != nil { + r.err = err + r.done = true + return false + } + if rec == nil { + r.done = true + return false + } + r.cur = rec + return true +} + +func (r *lazyReader) RecordBatch() 
arrow.RecordBatch { return r.cur } +func (r *lazyReader) Record() arrow.RecordBatch { return r.cur } +func (r *lazyReader) Retain() { r.refCount.Add(1) } +func (r *lazyReader) Release() { + if r.refCount.Add(-1) == 0 { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + } +} + +// --- fileReaderAsRecordReader --------------------------------------------- + +// fileReaderAsRecordReader adapts an *ipc.FileReader (random-access file +// format) to the array.RecordReader interface required by Ingest. +type fileReaderAsRecordReader struct { + fr *ipc.FileReader + cur arrow.RecordBatch + err error + + refCount atomic.Int64 +} + +func (r *fileReaderAsRecordReader) Schema() *arrow.Schema { return r.fr.Schema() } +func (r *fileReaderAsRecordReader) Err() error { return r.err } + +func (r *fileReaderAsRecordReader) Next() bool { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + if r.err != nil { + return false + } + rec, err := r.fr.Read() + if errors.Is(err, io.EOF) { + return false + } + if err != nil { + r.err = err + return false + } + if rec == nil { + return false + } + // FileReader.Read documents that the record is valid until next Read. + // Retain so we own the reference until our own Next/Release. 
+ rec.Retain() + r.cur = rec + return true +} + +func (r *fileReaderAsRecordReader) RecordBatch() arrow.RecordBatch { return r.cur } +func (r *fileReaderAsRecordReader) Record() arrow.RecordBatch { return r.cur } +func (r *fileReaderAsRecordReader) Retain() { r.refCount.Add(1) } +func (r *fileReaderAsRecordReader) Release() { + if r.refCount.Add(-1) == 0 { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + } +} diff --git a/client/ingest_test.go b/client/ingest_test.go new file mode 100644 index 00000000..5559ad84 --- /dev/null +++ b/client/ingest_test.go @@ -0,0 +1,504 @@ +package client + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "sync/atomic" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- shared helpers ------------------------------------------------------- + +// ingestOKHandler is a server that decodes the incoming Arrow IPC stream, +// counts rows, and answers with a canonical IngestResult. The decoded +// schema's column names are returned in the response so tests can assert +// per-column fidelity. 
+func ingestOKHandler(t *testing.T, pool memory.Allocator) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if r.URL.Path != "/ipc/ingest" { + t.Errorf("expected /ipc/ingest, got %s", r.URL.Path) + } + if ct := r.Header.Get("Content-Type"); !strings.HasPrefix(ct, "application/vnd.apache.arrow.stream") { + t.Errorf("unexpected content-type: %s", ct) + } + body, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + rr, err := ipc.NewReader(bytes.NewReader(body), ipc.WithAllocator(pool)) + if err != nil { + t.Fatalf("decode body as arrow stream: %v", err) + } + defer rr.Release() + var rows int64 + var cols []string + for _, f := range rr.Schema().Fields() { + cols = append(cols, f.Name) + } + for rr.Next() { + rows += rr.RecordBatch().NumRows() + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "data_object": r.URL.Query().Get("data_object"), + "inserted": rows, + "columns": cols, + }) + } +} + +// smallRecord builds a single 2-row record with an int32 + string column. 
+func smallRecord(t *testing.T, pool memory.Allocator) arrow.RecordBatch { + t.Helper() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{10, 20}, nil) + b.Field(1).(*array.StringBuilder).AppendValues([]string{"alpha", "beta"}, nil) + return b.NewRecord() +} + +func TestBuildIngestURL(t *testing.T) { + tests := []struct { + name string + base string + dataObject string + want string + }{ + { + name: "canonical /ipc base", + base: "http://localhost:15000/ipc", + dataObject: "pg_store.public.events", + want: "http://localhost:15000/ipc/ingest?data_object=pg_store.public.events", + }, + { + name: "trailing slash on /ipc", + base: "http://localhost:15000/ipc/", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "base without /ipc", + base: "http://localhost:15000", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "base already at /ipc/ingest", + base: "http://localhost:15000/ipc/ingest", + dataObject: "events", + want: "http://localhost:15000/ipc/ingest?data_object=events", + }, + { + name: "data_object with special chars is encoded", + base: "http://localhost:15000/ipc", + dataObject: "schema.table with space", + want: "http://localhost:15000/ipc/ingest?data_object=schema.table+with+space", + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := buildIngestURL(tc.base, tc.dataObject) + require.NoError(t, err) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestBuildIngestURL_BadBase(t *testing.T) { + _, err := buildIngestURL("://not a url", "events") + require.Error(t, err) +} + +// TestIngest_RoundTrip exercises the full client path against an in-memory +// HTTP server: 
it verifies the URL, headers, that the body is a valid Arrow +// IPC stream, and that the success response is parsed back into IngestResult. +func TestIngest_RoundTrip(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true}, + }, nil) + + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3}, nil) + b.Field(1).(*array.StringBuilder).AppendValues([]string{"a", "b", "c"}, nil) + rec := b.NewRecord() + defer rec.Release() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + if r.URL.Path != "/ipc/ingest" { + t.Errorf("expected /ipc/ingest, got %s", r.URL.Path) + } + if got := r.URL.Query().Get("data_object"); got != "ns.mytable" { + t.Errorf("expected data_object=ns.mytable, got %q", got) + } + if ct := r.Header.Get("Content-Type"); !strings.HasPrefix(ct, "application/vnd.apache.arrow.stream") { + t.Errorf("unexpected content-type: %s", ct) + } + + body, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + rr, err := ipc.NewReader(bytes.NewReader(body), ipc.WithAllocator(pool)) + if err != nil { + t.Fatalf("decode body as arrow stream: %v", err) + } + defer rr.Release() + var rows int64 + for rr.Next() { + rows += rr.RecordBatch().NumRows() + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "data_object": r.URL.Query().Get("data_object"), + "inserted": rows, + "columns": []string{"id", "name"}, + }) + })) + t.Cleanup(srv.Close) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestRecord(context.Background(), "ns.mytable", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, "ns.mytable", 
res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.Equal(t, []string{"id", "name"}, res.Columns) +} + +func TestIngest_ServerError(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecord() + defer rec.Release() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "column foo is not defined"}) + })) + t.Cleanup(srv.Close) + + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestRecord(context.Background(), "ns.x", rec) + require.Error(t, err) + assert.True(t, strings.Contains(err.Error(), "column foo is not defined"), + "error should surface server message, got: %v", err) +} + +func TestIngest_NilReader(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.Ingest(context.Background(), "ns.x", nil) + require.Error(t, err) + assert.True(t, errors.Is(err, err)) +} + +func TestIngest_EmptyDataObject(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestRecord(context.Background(), "", nil) + require.Error(t, err) +} + +func TestIngest_ServerErrorTextBody(t *testing.T) { + // 4xx with a non-JSON body — error message must still be surfaced. 
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.WriteHeader(http.StatusUnsupportedMediaType) + _, _ = w.Write([]byte("Content-Type must be application/vnd.apache.arrow.stream")) + })) + t.Cleanup(srv.Close) + + pool := memory.NewGoAllocator() + rec := smallRecord(t, pool) + defer rec.Release() + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestRecord(context.Background(), "ns.x", rec) + require.Error(t, err) + assert.Contains(t, err.Error(), "Content-Type must be") +} + +func TestIngest_WriterErrorWinsOverHTTP(t *testing.T) { + // reader.Err() returns a non-nil error AFTER yielding one good batch. + // The HTTP side will see EOF / truncated stream and may respond 4xx; + // the client must surface the writer-side error, not the HTTP one. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Drain body to unblock the writer, then respond with a generic 500 + // so we can confirm the client prefers writer error over this. 
+ _, _ = io.Copy(io.Discard, r.Body) + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("server failed")) + })) + t.Cleanup(srv.Close) + + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + pool := memory.NewGoAllocator() + errBoom := errors.New("reader source explosion") + calls := 0 + reader := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + calls++ + if calls == 1 { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(1) + return b.NewRecord(), nil + } + return nil, errBoom + }) + defer reader.Release() + + c := NewClient(srv.URL + "/ipc") + _, err := c.Ingest(context.Background(), "ns.x", reader) + require.Error(t, err) + assert.Contains(t, err.Error(), "reader source explosion", + "writer-side error must be surfaced, got: %v", err) +} + +// --- IngestStream --------------------------------------------------------- + +func TestIngestStream_Happy(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + rec := smallRecord(t, pool) + defer rec.Release() + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(rec.Schema())) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestStream(context.Background(), "ns.t", &buf) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) + assert.Equal(t, "ns.t", res.DataObject) + assert.ElementsMatch(t, []string{"id", "name"}, res.Columns) +} + +func TestIngestStream_NilBody(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestStream(context.Background(), "ns.t", nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") +} + +func TestIngestStream_EmptyDataObject(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestStream(context.Background(), 
"", bytes.NewReader(nil)) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +func TestIngestStream_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "invalid arrow stream"}) + })) + t.Cleanup(srv.Close) + c := NewClient(srv.URL + "/ipc") + _, err := c.IngestStream(context.Background(), "ns.t", bytes.NewReader([]byte("not arrow"))) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid arrow stream") +} + +// --- IngestArrowIPCFile --------------------------------------------------- + +func writeArrowStreamFile(t *testing.T, dir string, pool memory.Allocator) (string, *arrow.Schema) { + t.Helper() + rec := smallRecord(t, pool) + defer rec.Release() + path := filepath.Join(dir, "data.arrows") + f, err := os.Create(path) + require.NoError(t, err) + w := ipc.NewWriter(f, ipc.WithSchema(rec.Schema())) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + require.NoError(t, f.Close()) + return path, rec.Schema() +} + +func writeArrowIPCFile(t *testing.T, dir string, pool memory.Allocator) (string, *arrow.Schema) { + t.Helper() + rec := smallRecord(t, pool) + defer rec.Release() + path := filepath.Join(dir, "data.arrow") + f, err := os.Create(path) + require.NoError(t, err) + fw, err := ipc.NewFileWriter(f, ipc.WithSchema(rec.Schema())) + require.NoError(t, err) + require.NoError(t, fw.Write(rec)) + require.NoError(t, fw.Close()) + require.NoError(t, f.Close()) + return path, rec.Schema() +} + +func TestIngestArrowIPCFile_StreamFormat(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + path, _ := writeArrowStreamFile(t, t.TempDir(), pool) + // Sanity-check: file is *stream* format (no ARROW1 magic). 
+ head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.NotEqual(t, "ARROW1", string(head[:6])) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestArrowIPCFile(context.Background(), "ns.t", path) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) +} + +func TestIngestArrowIPCFile_FileFormat(t *testing.T) { + pool := memory.NewGoAllocator() + srv := httptest.NewServer(ingestOKHandler(t, pool)) + t.Cleanup(srv.Close) + + path, _ := writeArrowIPCFile(t, t.TempDir(), pool) + // Sanity-check: file is *file* format (ARROW1 magic). + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.Equal(t, "ARROW1", string(head[:6])) + + c := NewClient(srv.URL + "/ipc") + res, err := c.IngestArrowIPCFile(context.Background(), "ns.t", path) + require.NoError(t, err) + assert.Equal(t, int64(2), res.Inserted) +} + +func TestIngestArrowIPCFile_NotFound(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestArrowIPCFile(context.Background(), "ns.t", + filepath.Join(t.TempDir(), "does-not-exist.arrows")) + require.Error(t, err) +} + +func TestIngestArrowIPCFile_EmptyPath(t *testing.T) { + c := NewClient("http://localhost/ipc") + _, err := c.IngestArrowIPCFile(context.Background(), "ns.t", "") + require.Error(t, err) + assert.Contains(t, err.Error(), "path is required") +} + +// --- NewLazyReader -------------------------------------------------------- + +func TestNewLazyReader_CompletesOnNilNil(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + i := 0 + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i >= 3 { + return nil, nil // signal end-of-stream + } + i++ + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(int32(i)) + return b.NewRecord(), 
nil + }) + defer r.Release() + + assert.Equal(t, schema, r.Schema()) + seen := 0 + for r.Next() { + require.NotNil(t, r.RecordBatch()) + assert.Equal(t, int64(1), r.RecordBatch().NumRows()) + seen++ + } + require.NoError(t, r.Err()) + assert.Equal(t, 3, seen) + assert.False(t, r.Next(), "Next stays false after end-of-stream") +} + +func TestNewLazyReader_PropagatesError(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + errBoom := errors.New("source failure") + i := 0 + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i == 2 { + return nil, errBoom + } + i++ + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + b.Field(0).(*array.Int32Builder).Append(int32(i)) + return b.NewRecord(), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + assert.Equal(t, 2, seen, "should yield batches before the failing call") + require.Error(t, r.Err()) + assert.ErrorIs(t, r.Err(), errBoom) + assert.False(t, r.Next(), "Next stays false after error") +} + +func TestNewLazyReader_RetainReleaseRefcount(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + r := NewLazyReader(schema, func() (arrow.RecordBatch, error) { return nil, nil }) + // initial refCount = 1 (set by constructor) + rc := refCountOf(t, r) + assert.Equal(t, int64(1), rc.Load()) + r.Retain() + assert.Equal(t, int64(2), rc.Load()) + r.Release() + assert.Equal(t, int64(1), rc.Load()) + r.Release() + assert.Equal(t, int64(0), rc.Load()) +} + +// refCountOf reaches into the concrete *lazyReader to verify retain/release +// semantics. Test-only — the field is unexported on purpose. 
+func refCountOf(t *testing.T, r array.RecordReader) *atomic.Int64 { + t.Helper() + lr, ok := r.(*lazyReader) + require.True(t, ok, "expected *lazyReader, got %T", r) + return &lr.refCount +} diff --git a/engine.go b/engine.go index b393da80..ad654404 100644 --- a/engine.go +++ b/engine.go @@ -391,6 +391,7 @@ func (s *Service) endpoints() { s.router.Handle("/query", mw(http.HandlerFunc(s.queryHandler))) s.router.Handle("/jq-query", mw(http.HandlerFunc(s.jqHandler))) s.router.Handle("/ipc", mw(http.HandlerFunc(s.ipcHandler))) + s.router.Handle("/ipc/ingest", mw(http.HandlerFunc(s.ipcIngestHandler))) s.router.Handle("/subscribe", mw(http.HandlerFunc(s.subscribeHandler))) // s.router.Handle("/schema", mw(http.HandlerFunc(s.schemaHandler))) // disabled: schemaHandler blocked on gqlparser requiring *ast.Schema diff --git a/integration-test/ingest-duckdb/ingest_duckdb_test.go b/integration-test/ingest-duckdb/ingest_duckdb_test.go new file mode 100644 index 00000000..105a04b4 --- /dev/null +++ b/integration-test/ingest-duckdb/ingest_duckdb_test.go @@ -0,0 +1,2112 @@ +//go:build duckdb_arrow + +package ingest_duckdb_test + +import ( + "bytes" + "context" + "database/sql" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/extensions" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + _ "github.com/duckdb/duckdb-go/v2" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + hugr "github.com/hugr-lab/query-engine" + hugrclient "github.com/hugr-lab/query-engine/client" + "github.com/hugr-lab/query-engine/pkg/auth" + coredb 
"github.com/hugr-lab/query-engine/pkg/data-sources/sources/runtime/core-db" + "github.com/hugr-lab/query-engine/pkg/db" +) + +const ingestTestAPIKey = "ingest-test-api-key" + +// ingestEnv is per-test state on top of a shared hugr.Service (initialised +// once in TestMain). Each test owns a unique .duckdb file and a unique data +// source name, so tests don't share table state. Cleanup unloads the source +// to DETACH the file before t.TempDir() removes it. +type ingestEnv struct { + service *hugr.Service + server *httptest.Server + client *hugrclient.Client + dbPath string + dsName string // unique data source / catalog prefix, e.g. "duck_ingest_3" + dataObject string // dsName + ".events" +} + +// Shared service initialised once for the whole package — see TestMain. +// hugr.New + service.Init costs ~17s; doing it once cuts the package +// wall-clock from 13×17s ≈ 3.5min down to one-off ~17s + ~ms/test. +var ( + sharedService *hugr.Service + sharedServer *httptest.Server + sharedClient *hugrclient.Client + dsCounter atomic.Int64 +) + +func TestMain(m *testing.M) { + ctx := context.Background() + + service, err := hugr.New(hugr.Config{ + Debug: false, // shared service runs many tests — keep logs quiet + DB: db.Config{}, + CoreDB: coredb.New(coredb.Config{}), + Auth: &auth.Config{ + Providers: []auth.AuthProvider{ + auth.NewApiKey("ingest-test", auth.ApiKeyConfig{ + Key: ingestTestAPIKey, + DefaultRole: "admin", + }), + auth.NewAnonymous(auth.AnonymousConfig{ + Allowed: true, + Role: "admin", + }), + }, + }, + }) + if err != nil { + log.Fatalf("hugr.New: %v", err) + } + if err := service.Init(ctx); err != nil { + log.Fatalf("service.Init: %v", err) + } + sharedService = service + sharedServer = httptest.NewServer(service) + sharedClient = hugrclient.NewClient(sharedServer.URL + "/ipc") + + code := m.Run() + + sharedServer.Close() + _ = service.Close() + os.Exit(code) +} + +// openRO returns a fresh READ_ONLY sql.DB handle to the events database. 
+// DuckDB RO connections opened in the same process as a writer DO NOT +// transparently refresh snapshot across pooled connections, so we open a +// fresh handle per verification — this gives us a guaranteed post-write +// snapshot at the moment of the assertion. Callers should `defer Close()`. +func (e *ingestEnv) openRO(t *testing.T) *sql.DB { + t.Helper() + conn, err := sql.Open("duckdb", e.dbPath+"?access_mode=read_only") + require.NoError(t, err) + require.NoError(t, conn.PingContext(context.Background())) + return conn +} + +func setupEnv(t *testing.T) *ingestEnv { + t.Helper() + ctx := context.Background() + + n := dsCounter.Add(1) + dsName := fmt.Sprintf("duck_ingest_%d", n) + dbPath := filepath.Join(t.TempDir(), fmt.Sprintf("test_%d.duckdb", n)) + + // 1. Seed schema with a private writer; close before hugr opens it. + seed, err := sql.Open("duckdb", dbPath) + require.NoError(t, err) + _, err = seed.ExecContext(ctx, ` + INSTALL spatial; LOAD spatial; + CREATE SEQUENCE events_id_seq; + CREATE TABLE events ( + id BIGINT PRIMARY KEY DEFAULT nextval('events_id_seq'), + name VARCHAR NOT NULL, + value DOUBLE NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT true, + owner_id BIGINT, + payload JSON, + payload_large_string JSON, + payload_string_view JSON, + payload_binary JSON, + payload_large_binary JSON, + payload_binary_view JSON, + payload_struct JSON, + payload_list JSON, + payload_large_list JSON, + payload_fixed_size_list JSON, + payload_list_view JSON, + payload_large_list_view JSON, + payload_map JSON, + payload_scalar JSON, + payload_arrow_json JSON, + payload_geo_point JSON, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + geom GEOMETRY, + geom_wkt GEOMETRY, + geom_geojson GEOMETRY, + geom_hugr_geojson GEOMETRY, + geom_plain_geojson GEOMETRY, + geom_geojson_struct GEOMETRY, + geom_geojson_arrow_json GEOMETRY, + geom_wkb GEOMETRY, + geom_hexwkb GEOMETRY, + geom_line GEOMETRY, + geom_polygon_native GEOMETRY, + geom_multipoint GEOMETRY, + geom_multiline 
GEOMETRY, + geom_multipolygon GEOMETRY + ); + `) + require.NoError(t, err) + require.NoError(t, seed.Close()) + + // 2. Schema path for the localFS catalog. + schemaDir, err := filepath.Abs(filepath.Join("testdata", "schemas", "duck_ingest")) + require.NoError(t, err) + require.DirExists(t, schemaDir) + + // 3. Register & load this test's unique data source on the SHARED service. + mustQuery(t, ctx, sharedService, `mutation($data: core_data_sources_mut_input_data!) { + core { insert_data_sources(data: $data) { name } } + }`, map[string]any{ + "data": map[string]any{ + "name": dsName, + "type": "duckdb", + "prefix": dsName, + "as_module": true, + "path": dbPath, + "catalogs": []map[string]any{{ + "name": dsName, + "type": "localFS", + "path": schemaDir, + }}, + }, + }) + mustQuery(t, ctx, sharedService, `mutation($name: String!) { + function { core { load_data_source(name: $name) { success message } } } + }`, map[string]any{"name": dsName}) + + env := &ingestEnv{ + service: sharedService, + server: sharedServer, + client: sharedClient, + dbPath: dbPath, + dsName: dsName, + dataObject: dsName + ".events", + } + + // Unload on test completion so DETACH releases the .duckdb file before + // t.TempDir() removes it. Best-effort: ignore errors (next test uses a + // different name + file, so a leak is harmless within a single run). 
+ t.Cleanup(func() { + res, err := sharedService.Query(ctx, `mutation($name: String!, $hard: Boolean) { + function { core { unload_data_source(name: $name, hard: $hard) { success message } } } + }`, map[string]any{"name": dsName, "hard": true}) + if err == nil { + res.Close() + } + }) + + return env +} + +func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { + t.Helper() + res, err := s.Query(ctx, q, vars) + require.NoError(t, err) + if res.Err() != nil { + require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + } + res.Close() +} + +func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + registerIngestPermissionRoleData(t, service, role, mutationModule, map[string]any{ + "owner_id": "[$auth.user_id_int]", + }) +} + +func registerIngestPermissionRoleData(t *testing.T, service *hugr.Service, role, mutationModule string, data map[string]any) { + t.Helper() + ctx := context.Background() + mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) 
{ + core { + insert_roles(data: $role) { name } + allow_all: insert_role_permissions(data: $allowAll) { role type_name field_name } + inject_owner: insert_role_permissions(data: $inject) { role type_name field_name } + } + }`, map[string]any{ + "role": map[string]any{ + "name": role, + "description": "IPC ingest permission data integration test role", + }, + "allowAll": map[string]any{ + "role": role, + "type_name": "*", + "field_name": "*", + }, + "inject": map[string]any{ + "role": role, + "type_name": mutationModule, + "field_name": "insert_events", + "data": data, + }, + }) +} + +func moduleMutationName(module string) string { + return "_module_" + strings.ReplaceAll(module, ".", "_") + "_mutation" +} + +func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues(names, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues(values, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := recordFieldBuilder(t, b, "payload").(*array.StringBuilder) + for _, p := range payload { + if p == "" { + pBuilder.AppendNull() + } else { + pBuilder.Append(p) + } + } + tsBuilder := recordFieldBuilder(t, b, "created_at").(*array.TimestampBuilder) + tsBuilder.AppendValues(created, nil) + return b.NewRecordBatch() +} + +func 
makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { + t.Helper() + payloadType := arrow.DataType(arrow.BinaryTypes.String) + payloadName := "payload" + if binary { + payloadType = arrow.BinaryTypes.Binary + payloadName = "payload_binary" + } + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: payloadName, Type: payloadType, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("malformed-json") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + payloadBuilder := recordFieldBuilder(t, b, payloadName) + if binary { + payloadBuilder.(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + } else { + payloadBuilder.(*array.StringBuilder).Append(`{"unterminated":`) + } + return b.NewRecordBatch() +} + +func recordFieldBuilder(t *testing.T, b *array.RecordBuilder, name string) array.Builder { + t.Helper() + indices := b.Schema().FieldIndices(name) + require.Len(t, indices, 1, "arrow field %q must exist exactly once", name) + return b.Field(indices[0]) +} + +func mustRecordFieldBuilder(b *array.RecordBuilder, name string) array.Builder { + indices := b.Schema().FieldIndices(name) + if len(indices) != 1 { + panic(fmt.Sprintf("arrow field %q must exist exactly once", name)) + } + return b.Field(indices[0]) +} + +type eventsRecordBuilders struct { + names *array.StringBuilder + values *array.Float64Builder + active *array.BooleanBuilder + payloads *array.StringBuilder + createdAt *array.TimestampBuilder +} + +func eventsRecordBuildersFor(b *array.RecordBuilder) eventsRecordBuilders { + return eventsRecordBuilders{ + names: 
mustRecordFieldBuilder(b, "name").(*array.StringBuilder), + values: mustRecordFieldBuilder(b, "value").(*array.Float64Builder), + active: mustRecordFieldBuilder(b, "is_active").(*array.BooleanBuilder), + payloads: mustRecordFieldBuilder(b, "payload").(*array.StringBuilder), + createdAt: mustRecordFieldBuilder(b, "created_at").(*array.TimestampBuilder), + } +} + +type jsonPhysicalTypeSpec struct { + name string + dataType arrow.DataType + arrowExtension string + expected any + appendValue func(*testing.T, array.Builder) +} + +const ( + jsonStructKindField = iota + jsonStructCountField +) + +func jsonPhysicalTypeSpecs(t *testing.T) []jsonPhysicalTypeSpec { + t.Helper() + structType := arrow.StructOf( + arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + ) + geoPointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) + + return []jsonPhysicalTypeSpec{ + {name: "payload", dataType: arrow.BinaryTypes.String, expected: map[string]any{"kind": "string"}, appendValue: appendJSONText(`{"kind":"string"}`)}, + {name: "payload_large_string", dataType: arrow.BinaryTypes.LargeString, expected: map[string]any{"kind": "large_string"}, appendValue: appendJSONText(`{"kind":"large_string"}`)}, + {name: "payload_string_view", dataType: arrow.BinaryTypes.StringView, expected: map[string]any{"kind": "string_view"}, appendValue: appendJSONText(`{"kind":"string_view"}`)}, + {name: "payload_binary", dataType: arrow.BinaryTypes.Binary, expected: map[string]any{"kind": "binary"}, appendValue: appendJSONText(`{"kind":"binary"}`)}, + {name: "payload_large_binary", dataType: arrow.BinaryTypes.LargeBinary, expected: map[string]any{"kind": "large_binary"}, 
appendValue: appendJSONText(`{"kind":"large_binary"}`)}, + {name: "payload_binary_view", dataType: arrow.BinaryTypes.BinaryView, expected: map[string]any{"kind": "binary_view"}, appendValue: appendJSONText(`{"kind":"binary_view"}`)}, + {name: "payload_struct", dataType: structType, expected: map[string]any{"kind": "struct", "count": float64(14)}, appendValue: appendJSONStruct("struct", 14)}, + {name: "payload_list", dataType: arrow.ListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(1), float64(2)}, appendValue: appendInt64JSONList(1, 2)}, + {name: "payload_large_list", dataType: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(3), float64(4)}, appendValue: appendInt64JSONList(3, 4)}, + {name: "payload_fixed_size_list", dataType: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), expected: []any{float64(5), float64(6)}, appendValue: appendInt64JSONList(5, 6)}, + {name: "payload_list_view", dataType: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(7), float64(8)}, appendValue: appendInt64JSONList(7, 8)}, + {name: "payload_large_list_view", dataType: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(9), float64(10)}, appendValue: appendInt64JSONList(9, 10)}, + {name: "payload_map", dataType: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), expected: map[string]any{"a": float64(11), "b": float64(12)}, appendValue: appendInt64JSONMap([]string{"a", "b"}, []int64{11, 12})}, + {name: "payload_scalar", dataType: arrow.PrimitiveTypes.Int64, expected: "13", appendValue: appendInt64JSONScalar(13)}, + {name: "payload_arrow_json", dataType: arrowJSONType, expected: map[string]any{"kind": "arrow_json"}, appendValue: appendArrowJSONText(`{"kind":"arrow_json"}`)}, + {name: "payload_geo_point", dataType: geoPointType, arrowExtension: "geoarrow.point", expected: geoJSONGeometry("Point", pointCoordinate(xyPoint{x: 30.5, y: 50.25})), appendValue: appendGeoArrowJSONPoint(xyPoint{x: 30.5, 
y: 50.25})}, + } +} + +func jsonPhysicalTypeColumns(t *testing.T) []string { + t.Helper() + specs := jsonPhysicalTypeSpecs(t) + columns := make([]string, 0, len(specs)) + for _, spec := range specs { + columns = append(columns, spec.name) + } + return columns +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + specs := jsonPhysicalTypeSpecs(t) + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + for _, spec := range specs { + field := arrow.Field{Name: spec.name, Type: spec.dataType, Nullable: false} + if spec.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": spec.arrowExtension}) + } + fields = append(fields, field) + } + schema := arrow.NewSchema(fields, nil) + + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("json-physical-types") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + for _, spec := range specs { + spec.appendValue(t, recordFieldBuilder(t, b, spec.name)) + } + return b.NewRecordBatch() +} + +func appendJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.StringBuilder: + b.Append(value) + case *array.LargeStringBuilder: + b.Append(value) + case *array.StringViewBuilder: + b.Append(value) + case *array.BinaryBuilder: + b.Append([]byte(value)) + case *array.BinaryViewBuilder: + b.Append([]byte(value)) + default: + require.Failf(t, "unsupported JSON text builder", "got %T", builder) + } + } +} + +func appendJSONStruct(kind string, count int64) 
func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + structBuilder.Append(true) + structBuilder.FieldBuilder(jsonStructKindField).(*array.StringBuilder).Append(kind) + structBuilder.FieldBuilder(jsonStructCountField).(*array.Int64Builder).Append(count) + } +} + +func appendInt64JSONList(values ...int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.ListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.FixedSizeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.ListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + default: + require.Failf(t, "unsupported JSON list builder", "got %T", builder) + } + } +} + +func appendInt64JSONMap(keys []string, values []int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + mapBuilder, ok := builder.(*array.MapBuilder) + require.Truef(t, ok, "got %T, want *array.MapBuilder", builder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues(keys, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues(values, nil) + } +} + +func appendInt64JSONScalar(value int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + intBuilder, ok := builder.(*array.Int64Builder) + require.Truef(t, ok, "got 
%T, want *array.Int64Builder", builder) + intBuilder.Append(value) + } +} + +func appendArrowJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + extensionBuilder, ok := builder.(*array.ExtensionBuilder) + require.Truef(t, ok, "got %T, want *array.ExtensionBuilder", builder) + extensionBuilder.StorageBuilder().(*array.StringBuilder).Append(value) + } +} + +func appendGeoArrowJSONPoint(point xyPoint) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + appendPoint(structBuilder, point) + } +} + +func jsonPhysicalTypesExpected(t *testing.T) map[string]any { + t.Helper() + expected := map[string]any{"name": "json-physical-types"} + for _, spec := range jsonPhysicalTypeSpecs(t) { + expected[spec.name] = spec.expected + } + return expected +} + +func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { + t.Helper() + query := fmt.Sprintf(`{ + %s { + events(filter: {name: {eq: "json-physical-types"}}) { + name + %s + } + } + }`, dsName, strings.Join(jsonPhysicalTypeColumns(t), "\n")) + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data := payload["data"].(map[string]any) + root := data[dsName].(map[string]any) + rows := root["events"].([]any) + require.Len(t, rows, 1, "response: %s", string(body)) + assert.Equal(t, jsonPhysicalTypesExpected(t), rows[0]) +} + +// --- Core tests ----------------------------------------------------------- + +func TestIngest_DuckDB_RoundTrip(t *testing.T) { + env := setupEnv(t) + + now := 
arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"alpha", "beta", "gamma"}, + []float64{1.5, 2.5, 3.5}, + []bool{true, false, true}, + []string{`{"k":"v"}`, "", `{"x":1}`}, + []arrow.Timestamp{now, now, now}, + ) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, env.dataObject, res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Verify via a fresh READ_ONLY verifier connection (independent of hugr's + // session). Open a new handle per verification to guarantee a post-write + // snapshot — see openRO doc. + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + ro2 := env.openRO(t) + defer ro2.Close() + rows, err := ro2.Query("SELECT name, value, is_active, payload IS NOT NULL FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + var ( + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool + ) + for rows.Next() { + var n string + var v float64 + var a, j bool + require.NoError(t, rows.Scan(&n, &v, &a, &j)) + gotNames = append(gotNames, n) + gotValues = append(gotValues, v) + gotActive = append(gotActive, a) + gotHasJSON = append(gotHasJSON, j) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"alpha", "beta", "gamma"}, gotNames) + assert.Equal(t, []float64{1.5, 2.5, 3.5}, gotValues) + assert.Equal(t, []bool{true, false, true}, gotActive) + assert.Equal(t, []bool{true, false, true}, gotHasJSON) +} + +func TestIngest_DuckDB_JSONPhysicalTypes(t *testing.T) { + env := setupEnv(t) + rec := makeJSONPhysicalTypesRecord(t) + defer rec.Release() + + res, err := 
env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns(t)...) + assert.ElementsMatch(t, expectedColumns, res.Columns) + assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) +} + +func TestIngest_DuckDB_RejectsMalformedJSON(t *testing.T) { + for _, tt := range []struct { + name string + binary bool + }{ + {name: "string"}, + {name: "binary", binary: true}, + } { + t.Run(tt.name, func(t *testing.T) { + env := setupEnv(t) + rec := makeMalformedJSONRecord(t, tt.binary) + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.Error(t, err) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Zero(t, count, "a failed JSON cast must roll back the entire ingest") + }) + } +} + +func TestIngest_DuckDB_PermissionData(t *testing.T) { + env := setupEnv(t) + + const ownerID = 4242 + role := "ingest_perm_" + env.dsName + registerIngestPermissionRole(t, env.service, role, moduleMutationName(env.dsName)) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-alpha", "perm-beta"}, + []float64{11.5, 12.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo(strconv.Itoa(ownerID), "permission-user"), + ) + res, err := permClient.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "owner_id", "owner_id must be injected by permissions, not 
sent in Arrow") + + ro := env.openRO(t) + defer ro.Close() + rows, err := ro.Query("SELECT name, owner_id FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]int64{} + for rows.Next() { + var ( + name string + ownerID int64 + ) + require.NoError(t, rows.Scan(&name, &ownerID)) + got[name] = ownerID + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]int64{ + "perm-alpha": ownerID, + "perm-beta": ownerID, + }, got) +} + +func TestIngest_DuckDB_PermissionDataGeometry(t *testing.T) { + env := setupEnv(t) + + role := "ingest_perm_geom_" + env.dsName + registerIngestPermissionRoleData(t, env.service, role, moduleMutationName(env.dsName), map[string]any{ + "geom": "POINT (7.25 8.5)", + }) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-geom-alpha", "perm-geom-beta"}, + []float64{21.5, 22.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo("7", "permission-geometry-user"), + ) + res, err := permClient.IngestRecord(context.Background(), env.dataObject, rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "geom", "geom must be injected by permissions, not sent in Arrow") + + ro := env.openRO(t) + defer ro.Close() + _, err = ro.Exec("LOAD spatial") + require.NoError(t, err) + + rows, err := ro.Query("SELECT name, ST_AsText(geom) FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + for rows.Next() { + var name, geom string + require.NoError(t, rows.Scan(&name, &geom)) + got[name] = compactWKT(geom) + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "perm-geom-alpha": 
"POINT(7.25 8.5)", + "perm-geom-beta": "POINT(7.25 8.5)", + }, got) +} + +func TestIngest_DuckDB_UnknownColumn(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "not_a_column", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"x"}, nil) + recordFieldBuilder(t, b, "not_a_column").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), env.dataObject, rec) + require.Error(t, err) + assert.Contains(t, err.Error(), "not_a_column") + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 0, count, "no rows should have been inserted on validation failure") +} + +func TestIngest_DuckDB_UnknownDataObject(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).AppendValues([]int32{1}, nil) + rec := b.NewRecordBatch() + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), env.dsName+".does_not_exist", rec) + require.Error(t, err) +} + +func TestIngest_DuckDB_MultipleBatches(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + 
{Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + mk := func(names []string) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + fields := eventsRecordBuildersFor(b) + fields.names.AppendValues(names, nil) + vals := make([]float64, len(names)) + for i := range vals { + vals[i] = float64(i) + } + fields.values.AppendValues(vals, nil) + active := make([]bool, len(names)) + for i := range active { + active[i] = true + } + fields.active.AppendValues(active, nil) + fields.payloads.AppendNulls(len(names)) + ts := make([]arrow.Timestamp, len(names)) + for i := range ts { + ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) + } + fields.createdAt.AppendValues(ts, nil) + return b.NewRecordBatch() + } + rec1 := mk([]string{"a", "b"}) + defer rec1.Release() + rec2 := mk([]string{"c", "d", "e"}) + defer rec2.Release() + + rr, err := array.NewRecordReader(schema, []arrow.RecordBatch{rec1, rec2}) + require.NoError(t, err) + defer rr.Release() + + res, err := env.client.Ingest(context.Background(), env.dataObject, rr) + require.NoError(t, err) + assert.Equal(t, int64(5), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 5, count) +} + +// TestIngest_DuckDB_Bulk — 50k rows via the typed Go client + NewLazyReader +// (lazy generation, never materialised), with post-POST COUNT(*) timing +// check against a fresh READ_ONLY verifier. 
+func TestIngest_DuckDB_Bulk(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-bulk" + ) + + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + batchIdx := 0 + reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if batchIdx >= numBatches { + return nil, nil + } + rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + batchIdx++ + return rec, nil + }) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), env.dataObject, reader) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + // post-POST COUNT(*) through a fresh READ_ONLY connection — synchronicity. + ro := env.openRO(t) + defer ro.Close() + countStart := time.Now() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + // Spot-check first 5 rows by content. 
+ ro2 := env.openRO(t) + defer ro2.Close() + rows, err := ro2.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) + require.NoError(t, err) + defer rows.Close() + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) + for rows.Next() { + var n string + var v float64 + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{ + namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", + namePrefix + "-000003", namePrefix + "-000004", + }, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + // row%5 == 0 ⇒ payload IS NULL; only row 0 in the first five. + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + ro3 := env.openRO(t) + defer ro3.Close() + var activeCount int + require.NoError(t, ro3.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("bulk ingest via Go client: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_DuckDB_Stream — IngestStream happy path with a pre-serialised +// Arrow buffer. 
+func TestIngest_DuckDB_Stream(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecordBatch() + b.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + res, err := env.client.IngestStream(context.Background(), env.dataObject, &buf) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 2, count) +} + +func TestIngest_DuckDB_Stream_Empty(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestStream(context.Background(), env.dataObject, nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") + + _, err = env.client.IngestStream(context.Background(), "", bytes.NewReader([]byte{})) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +// TestIngest_DuckDB_ArrowIPCFile_StreamFormat — 50k×1000 stream-format file +// → IngestArrowIPCFile → byte-forwarded to /ipc/ingest. 
+func TestIngest_DuckDB_ArrowIPCFile_StreamFormat(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-fs" + ) + path := filepath.Join(t.TempDir(), "events_stream.arrows") + writeEventsArrowFile(t, path, namePrefix, arrowStreamFormat, numBatches, rowsPerBatch) + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.NotEqual(t, "ARROW1", string(head[:6]), "stream format must not start with ARROW1 magic") + + start := time.Now() + res, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, path) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + assertArrowIPCFileGeometry(t, env, ro, namePrefix, totalRows) + + t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_DuckDB_ArrowIPCFile_FileFormat — 50k×1000 file-format (.arrow, +// ARROW1 magic + footer) → IngestArrowIPCFile detects magic, re-streams via +// ipc.FileReader. 
+func TestIngest_DuckDB_ArrowIPCFile_FileFormat(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-ff" + ) + path := filepath.Join(t.TempDir(), "events_file.arrow") + writeEventsArrowFile(t, path, namePrefix, arrowFileFmt, numBatches, rowsPerBatch) + head, err := os.ReadFile(path) + require.NoError(t, err) + require.GreaterOrEqual(t, len(head), 6) + assert.Equal(t, "ARROW1", string(head[:6]), "file format must start with ARROW1 magic") + + start := time.Now() + res, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, path) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + assertArrowIPCFileGeometry(t, env, ro, namePrefix, totalRows) + + t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +func assertArrowIPCFileGeometry(t *testing.T, env *ingestEnv, ro *sql.DB, namePrefix string, totalRows int) { + t.Helper() + _, err := ro.Exec("LOAD spatial") + require.NoError(t, err) + + lastName, lastPoint := geometryBatchRow(namePrefix, totalRows-1) + values := scanGeometryValues(t, ro.QueryRow(fmt.Sprintf(` + SELECT %s + FROM events + WHERE name = ? 
+ `, geometrySelectList()), lastName)) + assert.Equal(t, geometryExpected(pointWKT(lastPoint), coord(lastPoint.x), coord(lastPoint.y)), values) + assertGeometryReadThroughHugr(t, env.service, env.dsName, fmt.Sprintf(`filter: { name: { eq: "%s" } }`, lastName), []map[string]any{ + geometryReadExpected(lastName, lastPoint, lastPoint.x, lastPoint.y), + }) +} + +func TestIngest_DuckDB_ArrowIPCFile_NotFound(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestArrowIPCFile(context.Background(), env.dataObject, + filepath.Join(t.TempDir(), "does-not-exist.arrows")) + require.Error(t, err) +} + +// TestIngest_DuckDB_LazyReader — alias: bulk ingest via NewLazyReader, but +// keeping symmetry with PG suite name. Same scenario as Bulk above but with +// a distinct prefix so the suite can run all tests against a single setup +// without collisions if combined. +func TestIngest_DuckDB_LazyReader(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-lz" + ) + pool := memory.NewGoAllocator() + schema := eventsArrowSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + batchIdx := 0 + reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if batchIdx >= numBatches { + return nil, nil + } + rec := buildEventsBatch(pool, schema, batchIdx, rowsPerBatch, namePrefix, base) + batchIdx++ + return rec, nil + }) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), env.dataObject, reader) + elapsed := time.Since(start) + require.NoError(t, err) + assert.Equal(t, int64(totalRows), res.Inserted) + + ro := env.openRO(t) + defer ro.Close() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, totalRows, count) + + t.Logf("lazy-reader bulk ingest: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, 
float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_LazyReader_Termination_DuckDB — engine-agnostic unit-style test +// for NewLazyReader's termination semantics. Doesn't need the server, but +// mirrors the PG suite for full symmetry. +func TestIngest_LazyReader_Termination_DuckDB(t *testing.T) { + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false}, + }, nil) + mk := func(v int32) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "x").(*array.Int32Builder).Append(v) + return b.NewRecordBatch() + } + + // gen returns batches then nil → clean end-of-stream. + { + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i >= 3 { + return nil, nil + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + require.NoError(t, r.Err()) + assert.Equal(t, 3, seen) + assert.False(t, r.Next(), "Next after end-of-stream stays false") + } + + // gen returns an error → Err() exposes it, stream terminates. + { + errBoom := errors.New("boom") + i := 0 + r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) { + if i == 2 { + return nil, errBoom + } + i++ + return mk(int32(i)), nil + }) + defer r.Release() + seen := 0 + for r.Next() { + seen++ + } + assert.Equal(t, 2, seen, "yielded batches before failing call") + require.Error(t, r.Err()) + assert.ErrorIs(t, r.Err(), errBoom) + } +} + +// TestIngest_HTTP_Direct_DuckDB exercises low-level HTTP behaviour against +// /ipc/ingest (bad Content-Type, missing data_object, wrong method, invalid +// body) plus a real-world bulk path streamed through io.Pipe straight into +// the request body. Mirrors TestIngest_HTTP_Direct from the PG suite. +func TestIngest_HTTP_Direct_DuckDB(t *testing.T) { + env := setupEnv(t) + + // Missing data_object. 
+ resp, err := http.Post(env.server.URL+"/ipc/ingest", "application/vnd.apache.arrow.stream", bytes.NewReader(nil)) + require.NoError(t, err) + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Wrong method. + req, _ := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object="+env.dataObject, nil) + resp, err = http.DefaultClient.Do(req) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + + // Wrong content type. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "text/plain", bytes.NewReader([]byte("hello"))) + require.NoError(t, err) + resp.Body.Close() + assert.Equal(t, http.StatusUnsupportedMediaType, resp.StatusCode) + + // Body is not a valid Arrow stream. + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", bytes.NewReader([]byte("not arrow"))) + require.NoError(t, err) + b, _ = io.ReadAll(resp.Body) + resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b)) + + // Happy path — single small record. 
+ pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + bld := array.NewRecordBuilder(pool, schema) + recordFieldBuilder(t, bld, "name").(*array.StringBuilder).AppendValues([]string{"direct"}, nil) + recordFieldBuilder(t, bld, "value").(*array.Float64Builder).AppendValues([]float64{42}, nil) + recordFieldBuilder(t, bld, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true}, nil) + rec := bld.NewRecordBatch() + bld.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + var out hugrclient.IngestResult + require.NoError(t, json.NewDecoder(resp.Body).Decode(&out)) + resp.Body.Close() + assert.Equal(t, int64(1), out.Inserted) + + // --- Real-world bulk via io.Pipe streamed into the request body. 
+ const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-direct" + ) + bulkSchema := eventsArrowSchema() + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(bulkSchema)) + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + batchRec := buildEventsBatch(pool, bulkSchema, batchIdx, rowsPerBatch, namePrefix, base) + if werr := w.Write(batchRec); werr != nil { + streamErr = fmt.Errorf("write batch %d: %w", batchIdx, werr) + batchRec.Release() + break + } + batchRec.Release() + } + if cerr := w.Close(); cerr != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", cerr) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, werr, "writer goroutine failed") + require.NoError(t, postErr) + require.Equal(t, http.StatusOK, bulkResp.StatusCode) + var bulkResult hugrclient.IngestResult + require.NoError(t, json.NewDecoder(bulkResp.Body).Decode(&bulkResult)) + bulkResp.Body.Close() + elapsed := time.Since(start) + assert.Equal(t, int64(totalRows), bulkResult.Inserted) + + ro := env.openRO(t) + defer ro.Close() + countStart := time.Now() + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-direct-%'").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all dk-direct rows visible immediately after POST") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + t.Logf("bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, 
float64(totalRows)/elapsed.Seconds()) +} + +func TestIngest_HTTP_GeometryTypes_DuckDB(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + {name: "geo-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(2), out.Inserted) + assert.ElementsMatch(t, geometryTypesColumns(), out.Columns) + + ro := env.openRO(t) + defer ro.Close() + _, err = ro.Exec("LOAD spatial") + require.NoError(t, err) + + rows, err := ro.Query(fmt.Sprintf(` + SELECT name, + %s + FROM events + WHERE name LIKE 'geo-%%' + ORDER BY name + `, geometrySelectList())) + require.NoError(t, err) + defer rows.Close() + + got := map[string][]string{} + for rows.Next() { + name, values := scanNamedGeometryValues(t, rows) + got[name] = values + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string][]string{ + "geo-a": geometryExpected("POINT(30.5 50.25)", "0", "0"), + "geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"), + }, got) +} + +func TestIngest_HTTP_GeometryTypes_ReadThroughHugr_DuckDB(t *testing.T) { + env := setupEnv(t) + + rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{ + {name: "geo-read-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}}, + 
{name: "geo-read-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}}, + }) + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", &buf) + require.NoError(t, err) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{ + geometryReadExpected("geo-read-a", xyPoint{x: 30.5, y: 50.25}, 0, 0), + geometryReadExpected("geo-read-b", xyPoint{x: -73.935242, y: 40.730610}, 1, 1), + }) +} + +func TestIngest_HTTP_GeometryTypes_Bulk50k_DuckDB(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + namePrefix = "dk-geo-bulk" + ) + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + + pr, pw := io.Pipe() + writeErr := make(chan error, 1) + go func() { + defer close(writeErr) + w := ipc.NewWriter(pw, ipc.WithSchema(schema)) + var streamErr error + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildGeometryTypesBatch(t, pool, schema, batchIdx, rowsPerBatch, namePrefix) + if err := w.Write(rec); err != nil { + streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err) + rec.Release() + break + } + rec.Release() + } + if err := w.Close(); err != nil && streamErr == nil { + streamErr = fmt.Errorf("close arrow writer: %w", err) + } + _ = pw.CloseWithError(streamErr) + writeErr <- streamErr + }() + + start := time.Now() + resp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object="+env.dataObject, + "application/vnd.apache.arrow.stream", pr) + werr := <-writeErr + require.NoError(t, 
werr, "writer goroutine failed") + require.NoError(t, postErr) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body)) + + var out hugrclient.IngestResult + require.NoError(t, json.Unmarshal(body, &out)) + assert.Equal(t, int64(totalRows), out.Inserted) + + ro := env.openRO(t) + defer ro.Close() + _, err := ro.Exec("LOAD spatial") + require.NoError(t, err) + + var count int + require.NoError(t, ro.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'dk-geo-bulk-%'").Scan(&count)) + assert.Equal(t, totalRows, count) + + values := scanGeometryValues(t, ro.QueryRow(fmt.Sprintf(` + SELECT %s + FROM events + WHERE name = 'dk-geo-bulk-049999' + `, geometrySelectList()))) + assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values) + assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "dk-geo-bulk-049999" } }`, []map[string]any{ + geometryReadExpected("dk-geo-bulk-049999", xyPoint{x: 99, y: 49}, 99, 49), + }) + + elapsed := time.Since(start) + t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// --- helpers -------------------------------------------------------------- + +type arrowFileFormat int + +const ( + arrowStreamFormat arrowFileFormat = iota + arrowFileFmt +) + +func eventsArrowSchema() *arrow.Schema { + return arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) +} + +func eventsArrowFileSchema() *arrow.Schema { + fields := append([]arrow.Field{}, 
eventsArrowSchema().Fields()...) + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +type geometryTypesRow struct { + name string + value float64 + active bool + point xyPoint + shapeOrigin xyPoint +} + +func makeGeometryTypesRecord(t *testing.T, rows []geometryTypesRow) (arrow.RecordBatch, *arrow.Schema) { + t.Helper() + + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for _, row := range rows { + appendGeometryTypesRow(t, b, row) + } + + return b.NewRecordBatch(), schema +} + +func geometryTypesSchema() *arrow.Schema { + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +func geometryArrowFields() []arrow.Field { + pointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType := arrow.ListOf(pointType) + polygonType := arrow.ListOf(lineType) + fields := make([]arrow.Field, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + field := arrow.Field{ + Name: col.name, + Type: col.arrowType, + Nullable: false, + } + if col.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": col.arrowExtension}) + } + fields = append(fields, field) + } + return fields +} + +type geometryValueColumn struct { + name string + arrowType arrow.DataType + arrowExtension string + expectedWKT func(point, x, y string) string +} + +func geometryValueColumns(pointType, lineType, polygonType arrow.DataType) 
[]geometryValueColumn { + geoJSONStructType := arrow.StructOf( + arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "coordinates", Type: arrow.ListOf(arrow.ListOf(arrow.ListOf(arrow.PrimitiveTypes.Float64))), Nullable: false}, + ) + line := func(_ string, x string, y string) string { + return fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) + } + polygon := func(_ string, x string, y string) string { return polygonWKT(x, y) } + point := func(point string, _ string, _ string) string { return point } + multiPoint := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y) + } + multiLine := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)) + } + multiPolygon := func(_ string, x string, y string) string { return multiPolygonWKT(x, y) } + + return []geometryValueColumn{ + {name: "geom", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point}, + {name: "geom_wkt", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line}, + {name: "geom_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.geojson", expectedWKT: polygon}, + {name: "geom_hugr_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.geojson", expectedWKT: polygon}, + {name: "geom_plain_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geojson", expectedWKT: polygon}, + {name: "geom_geojson_struct", arrowType: geoJSONStructType, expectedWKT: polygon}, + {name: "geom_geojson_arrow_json", arrowType: mustArrowJSONType(), arrowExtension: "arrow.json", expectedWKT: polygon}, + {name: "geom_wkb", arrowType: arrow.BinaryTypes.Binary, arrowExtension: 
"geoarrow.wkb", expectedWKT: point}, + {name: "geom_hexwkb", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.hexwkb", expectedWKT: point}, + {name: "geom_line", arrowType: lineType, arrowExtension: "geoarrow.linestring", expectedWKT: line}, + {name: "geom_polygon_native", arrowType: polygonType, arrowExtension: "geoarrow.polygon", expectedWKT: polygon}, + {name: "geom_multipoint", arrowType: lineType, arrowExtension: "geoarrow.multipoint", expectedWKT: multiPoint}, + {name: "geom_multiline", arrowType: polygonType, arrowExtension: "geoarrow.multilinestring", expectedWKT: multiLine}, + {name: "geom_multipolygon", arrowType: arrow.ListOf(polygonType), arrowExtension: "geoarrow.multipolygon", expectedWKT: multiPolygon}, + } +} + +func mustArrowJSONType() arrow.DataType { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + panic(err) + } + return typ +} + +func geometryTypesColumns() []string { + pointType, lineType, polygonType := geometryArrowTypes() + columns := []string{"name", "value", "is_active"} + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + columns = append(columns, col.name) + } + return columns +} + +func geometryExpected(point, x, y string) []string { + pointType, lineType, polygonType := geometryArrowTypes() + values := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + values = append(values, col.expectedWKT(point, x, y)) + } + return values +} + +func geometryArrowTypes() (pointType, lineType, polygonType arrow.DataType) { + pointType = arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType = arrow.ListOf(pointType) + polygonType = arrow.ListOf(lineType) + return pointType, lineType, polygonType +} + +func geometrySelectList() string { + 
pointType, lineType, polygonType := geometryArrowTypes() + exprs := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + exprs = append(exprs, "ST_AsText("+col.name+")") + } + return strings.Join(exprs, ",\n") +} + +type sqlScanner interface { + Scan(dest ...any) error +} + +func scanGeometryValues(t *testing.T, scanner sqlScanner) []string { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + values := make([]string, len(columns)) + scanArgs := make([]any, 0, len(columns)) + for i := range columns { + scanArgs = append(scanArgs, &values[i]) + } + require.NoError(t, scanner.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return values +} + +func scanNamedGeometryValues(t *testing.T, rows *sql.Rows) (string, []string) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + var name string + values := make([]string, len(columns)) + scanArgs := []any{&name} + for i := range columns { + scanArgs = append(scanArgs, &values[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return name, values +} + +func polygonWKT(x, y string) string { + return fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + ) +} + +func multiPolygonWKT(x, y string) string { + return fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s)),((%s %s,%s %s,%s %s,%s %s,%s %s)))", + x, y, 
+ x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 10), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 10), + ) +} + +func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { + t.Helper() + + query := fmt.Sprintf(`{ + %s { + events(%s, order_by: [{field: "name", direction: ASC}]) { + name + geom + geom_wkt + geom_geojson + geom_hugr_geojson + geom_plain_geojson + geom_geojson_struct + geom_geojson_arrow_json + geom_wkb + geom_hexwkb + geom_line + geom_polygon_native + geom_multipoint + geom_multiline + geom_multipolygon + } + } + }`, dsName, filter) + + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data, ok := payload["data"].(map[string]any) + require.True(t, ok, "response data must be an object: %s", string(body)) + root, ok := data[dsName].(map[string]any) + require.True(t, ok, "response data.%s must be an object: %s", dsName, string(body)) + rawRows, ok := root["events"].([]any) + require.True(t, ok, "response data.%s.events must be an array: %s", dsName, string(body)) + + got := make([]map[string]any, 0, len(rawRows)) + for _, raw := range rawRows { + row, ok := raw.(map[string]any) + require.True(t, ok, "event row must be an object: %#v", raw) + got = append(got, row) + } + assert.Equal(t, expected, got) +} + +func geometryReadExpected(name string, point xyPoint, x, y float64) map[string]any { + 
return map[string]any{ + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_struct": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_arrow_json": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_hexwkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + } +} + +func geoJSONGeometry(typ string, coordinates any) map[string]any { + return map[string]any{ + "type": typ, + "coordinates": coordinates, + } +} + +func pointCoordinate(point xyPoint) []any { + return []any{point.x, point.y} +} + +func pointCoordinates(points []xyPoint) []any { + coords := make([]any, 0, len(points)) + for _, point := range points { + coords = append(coords, pointCoordinate(point)) + } + return coords +} + +func nestedPointCoordinates(lines [][]xyPoint) []any { + coords := make([]any, 0, len(lines)) + for _, line := range lines { + coords = append(coords, pointCoordinates(line)) + } + return coords +} + +func deepPointCoordinates(polygons [][][]xyPoint) []any { + coords := make([]any, 0, len(polygons)) + for 
_, polygon := range polygons { + coords = append(coords, nestedPointCoordinates(polygon)) + } + return coords +} + +func addCoord(v string, delta float64) string { + f, err := strconv.ParseFloat(v, 64) + if err != nil { + panic(err) + } + return coord(f + delta) +} + +func compactWKT(s string) string { + s = strings.ReplaceAll(s, ", ", ",") + s = strings.ReplaceAll(s, " (", "(") + if strings.HasPrefix(s, "MULTIPOINT((") && strings.HasSuffix(s, "))") { + inner := strings.TrimSuffix(strings.TrimPrefix(s, "MULTIPOINT(("), "))") + s = "MULTIPOINT(" + strings.ReplaceAll(inner, "),(", ",") + ")" + } + return s +} + +func buildGeometryTypesBatch(t *testing.T, pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { + t.Helper() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + name, point := geometryBatchRow(namePrefix, row) + appendGeometryTypesRow(t, b, geometryTypesRow{ + name: name, + value: float64(row) * 0.5, + active: row%2 == 0, + point: point, + shapeOrigin: point, + }) + } + return b.NewRecordBatch() +} + +func geometryBatchRow(namePrefix string, row int) (string, xyPoint) { + return fmt.Sprintf("%s-%06d", namePrefix, row), xyPoint{ + x: float64(row % 100), + y: float64(row / 1000), + } +} + +func appendGeometryTypesRow(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append(row.name) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(row.value) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(row.active) + appendGeometryValueFields(t, b, row) +} + +func appendGeometryValueFields(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + x, y := row.shapeOrigin.x, row.shapeOrigin.y + + appendPoint(recordFieldBuilder(t, b, "geom").(*array.StructBuilder), row.point) + 
recordFieldBuilder(t, b, "geom_wkt").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_hugr_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_plain_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + appendGeoJSONPolygonStruct(t, recordFieldBuilder(t, b, "geom_geojson_struct"), x, y) + recordFieldBuilder(t, b, "geom_geojson_arrow_json").(*array.ExtensionBuilder).StorageBuilder().(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + + wkbPoint, err := wkb.Marshal(orb.Point{row.point.x, row.point.y}) + require.NoError(t, err) + recordFieldBuilder(t, b, "geom_wkb").(*array.BinaryBuilder).Append(wkbPoint) + recordFieldBuilder(t, b, "geom_hexwkb").(*array.StringBuilder).Append(strings.ToUpper(hex.EncodeToString(wkbPoint))) + appendPointList(recordFieldBuilder(t, b, "geom_line").(*array.ListBuilder), linePoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_polygon_native").(*array.ListBuilder), polygonRings(x, y)) + appendPointList(recordFieldBuilder(t, b, "geom_multipoint").(*array.ListBuilder), multiPoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_multiline").(*array.ListBuilder), multiLines(x, y)) + appendPointListListList(recordFieldBuilder(t, b, "geom_multipolygon").(*array.ListBuilder), multiPolygons(x, y)) +} + +type xyPoint struct { + x float64 + y float64 +} + +const ( + geoArrowPointXField = iota + geoArrowPointYField +) + +const ( + geoJSONGeometryTypeField = iota + geoJSONGeometryCoordinatesField +) + +func appendPoint(sb *array.StructBuilder, point xyPoint) { + sb.Append(true) + sb.FieldBuilder(geoArrowPointXField).(*array.Float64Builder).Append(point.x) + sb.FieldBuilder(geoArrowPointYField).(*array.Float64Builder).Append(point.y) +} + +func appendGeoJSONPolygonStruct(t *testing.T, builder array.Builder, x, y float64) { + t.Helper() + sb, ok := 
builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + + sb.Append(true) + sb.FieldBuilder(geoJSONGeometryTypeField).(*array.StringBuilder).Append("Polygon") + appendGeoJSONPolygonCoordinates(sb.FieldBuilder(geoJSONGeometryCoordinatesField).(*array.ListBuilder), polygonRings(x, y)) +} + +func appendGeoJSONPolygonCoordinates(lb *array.ListBuilder, rings [][]xyPoint) { + lb.Append(true) + ringBuilder := lb.ValueBuilder().(*array.ListBuilder) + for _, ring := range rings { + ringBuilder.Append(true) + pointBuilder := ringBuilder.ValueBuilder().(*array.ListBuilder) + for _, point := range ring { + pointBuilder.Append(true) + pointBuilder.ValueBuilder().(*array.Float64Builder).AppendValues([]float64{point.x, point.y}, nil) + } + } +} + +func appendPointList(lb *array.ListBuilder, points []xyPoint) { + lb.Append(true) + sb := lb.ValueBuilder().(*array.StructBuilder) + for _, point := range points { + appendPoint(sb, point) + } +} + +func appendPointListList(lb *array.ListBuilder, lines [][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, points := range lines { + appendPointList(inner, points) + } +} + +func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) { + lb.Append(true) + inner := lb.ValueBuilder().(*array.ListBuilder) + for _, rings := range polygons { + appendPointListList(inner, rings) + } +} + +func linePoints(x, y float64) []xyPoint { + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}} +} + +func polygonRings(x, y float64) [][]xyPoint { + return [][]xyPoint{ + {{x: x, y: y}, {x: x, y: y + 4}, {x: x + 4, y: y + 4}, {x: x + 4, y: y}, {x: x, y: y}}, + {{x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}, {x: x + 2, y: y + 2}, {x: x + 1, y: y + 2}, {x: x + 1, y: y + 1}}, + } +} + +func multiPoints(x, y float64) []xyPoint { + return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y}} +} + +func multiLines(x, y float64) [][]xyPoint 
{ + return [][]xyPoint{ + {{x: x, y: y}, {x: x + 1, y: y + 1}}, + {{x: x + 2, y: y + 2}, {x: x + 3, y: y + 3}}, + } +} + +func multiPolygons(x, y float64) [][][]xyPoint { + return [][][]xyPoint{ + polygonRings(x, y), + {{{x: x + 10, y: y + 10}, {x: x + 10, y: y + 12}, {x: x + 12, y: y + 12}, {x: x + 12, y: y + 10}, {x: x + 10, y: y + 10}}}, + } +} + +func lineWKT(x, y float64) string { + return fmt.Sprintf("LINESTRING (%s %s, %s %s, %s %s)", + coord(x), coord(y), + coord(x+1), coord(y+1), + coord(x+2), coord(y+1)) +} + +func pointWKT(point xyPoint) string { + return fmt.Sprintf("POINT(%s %s)", coord(point.x), coord(point.y)) +} + +func polygonGeoJSON(x, y float64) string { + return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`, + coord(x), coord(y), + coord(x), coord(y+4), + coord(x+4), coord(y+4), + coord(x+4), coord(y), + coord(x), coord(y), + coord(x+1), coord(y+1), + coord(x+2), coord(y+1), + coord(x+2), coord(y+2), + coord(x+1), coord(y+2), + coord(x+1), coord(y+1)) +} + +func coord(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) +} + +// buildEventsBatch produces one RecordBatch of `rowsPerBatch` rows for the +// events schema. Row payload pattern matches the PG bulk fixtures so the +// spot-check assertions are reusable. 
+func buildEventsBatch(pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string, base time.Time) arrow.RecordBatch { + rb := array.NewRecordBuilder(pool, schema) + defer rb.Release() + fields := eventsRecordBuildersFor(rb) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + fields.names.Append(fmt.Sprintf("%s-%06d", namePrefix, row)) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) + if row%5 == 0 { + fields.payloads.AppendNull() + } else { + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + } + return rb.NewRecordBatch() +} + +type arrowIPCRecordWriter interface { + Write(arrow.RecordBatch) error + Close() error +} + +func newArrowIPCRecordWriter(t *testing.T, f *os.File, schema *arrow.Schema, format arrowFileFormat) arrowIPCRecordWriter { + t.Helper() + + switch format { + case arrowStreamFormat: + return ipc.NewWriter(f, ipc.WithSchema(schema)) + case arrowFileFmt: + w, err := ipc.NewFileWriter(f, ipc.WithSchema(schema)) + require.NoError(t, err) + return w + default: + t.Fatalf("unknown arrow file format: %d", format) + return nil + } +} + +func writeArrowIPCFile(t *testing.T, path string, schema *arrow.Schema, format arrowFileFormat, numBatches int, buildBatch func(batchIdx int) arrow.RecordBatch) { + t.Helper() + + f, err := os.Create(path) + require.NoError(t, err) + defer f.Close() + + w := newArrowIPCRecordWriter(t, f, schema, format) + for batchIdx := 0; batchIdx < numBatches; batchIdx++ { + rec := buildBatch(batchIdx) + require.NoError(t, w.Write(rec)) + rec.Release() + } + require.NoError(t, w.Close()) +} + +// writeEventsArrowFile writes an Arrow IPC file (stream or file format) at +// path with `numBatches * rowsPerBatch` rows for the events schema. 
+func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) { + t.Helper() + pool := memory.NewGoAllocator() + schema := eventsArrowFileSchema() + base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + + writeArrowIPCFile(t, path, schema, format, numBatches, func(batchIdx int) arrow.RecordBatch { + rb := array.NewRecordBuilder(pool, schema) + defer rb.Release() + fields := eventsRecordBuildersFor(rb) + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + name, point := geometryBatchRow(namePrefix, row) + fields.names.Append(name) + fields.values.Append(float64(row) * 0.5) + fields.active.Append(row%2 == 0) + if row%5 == 0 { + fields.payloads.AppendNull() + } else { + fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row)) + } + fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro())) + appendGeometryValueFields(t, rb, geometryTypesRow{point: point, shapeOrigin: point}) + } + return rb.NewRecordBatch() + }) +} + +// Silence "imported and not used" if a refactor leaves a quoted ref around. +var _ atomic.Int64 diff --git a/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql new file mode 100644 index 00000000..31394d6d --- /dev/null +++ b/integration-test/ingest-duckdb/testdata/schemas/duck_ingest/schema.graphql @@ -0,0 +1,38 @@ +type events @table(name: "events") { + id: BigInt! @pk @default(sequence: "events_id_seq") + name: String! + value: Float! + is_active: Boolean! 
@default(value: "true") + owner_id: BigInt + payload: JSON + payload_large_string: JSON + payload_string_view: JSON + payload_binary: JSON + payload_large_binary: JSON + payload_binary_view: JSON + payload_struct: JSON + payload_list: JSON + payload_large_list: JSON + payload_fixed_size_list: JSON + payload_list_view: JSON + payload_large_list_view: JSON + payload_map: JSON + payload_scalar: JSON + payload_arrow_json: JSON + payload_geo_point: JSON + created_at: Timestamp @default(value: "now()") + geom: Geometry @geometry_info(srid: 4326, type: POINT) + geom_wkt: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_hugr_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_plain_geojson: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_geojson_struct: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_geojson_arrow_json: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_wkb: Geometry @geometry_info(srid: 4326, type: POINT) + geom_hexwkb: Geometry @geometry_info(srid: 4326, type: POINT) + geom_line: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_polygon_native: Geometry @geometry_info(srid: 4326, type: POLYGON) + geom_multipoint: Geometry @geometry_info(srid: 4326, type: MULTIPOINT) + geom_multiline: Geometry @geometry_info(srid: 4326, type: MULTILINESTRING) + geom_multipolygon: Geometry @geometry_info(srid: 4326, type: MULTIPOLYGON) +} diff --git a/integration-test/ingest-postgres/docker-compose.yml b/integration-test/ingest-postgres/docker-compose.yml new file mode 100644 index 00000000..dd148be5 --- /dev/null +++ b/integration-test/ingest-postgres/docker-compose.yml @@ -0,0 +1,21 @@ +services: + postgres: + image: postgis/postgis:16-3.4 + command: + - postgres + - -c + - logging_collector=on + - -c + - log_statement=all + environment: + POSTGRES_DB: ingestdb + POSTGRES_USER: test + POSTGRES_PASSWORD: test + ports: ["5437:5432"] + volumes: 
+ - ./testdata/init.sql:/docker-entrypoint-initdb.d/01-init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U test -d ingestdb"] + interval: 2s + timeout: 5s + retries: 15 diff --git a/integration-test/ingest-postgres/ingest_postgres_test.go b/integration-test/ingest-postgres/ingest_postgres_test.go new file mode 100644 index 00000000..7f6f0aa7 --- /dev/null +++ b/integration-test/ingest-postgres/ingest_postgres_test.go @@ -0,0 +1,2432 @@ +//go:build duckdb_arrow + +package ingest_postgres_test + +import ( + "bytes" + "context" + "database/sql" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/extensions" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + _ "github.com/jackc/pgx/v5/stdlib" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + hugr "github.com/hugr-lab/query-engine" + hugrclient "github.com/hugr-lab/query-engine/client" + "github.com/hugr-lab/query-engine/pkg/auth" + coredb "github.com/hugr-lab/query-engine/pkg/data-sources/sources/runtime/core-db" + "github.com/hugr-lab/query-engine/pkg/db" +) + +const ( + envPostgresDSN = "INGEST_POSTGRES_DSN" + envSchemasPath = "HUGR_INGEST_SCHEMAS_PATH" + ingestTestAPIKey = "ingest-test-api-key" +) + +// ingestEnv is per-test view on top of a shared hugr.Service (initialised +// once in TestMain). hugr.New + service.Init costs ~17s; doing it once cuts +// the package wall-clock from N×17s down to a one-off ~17s + ~ms/test. 
+type ingestEnv struct { + service *hugr.Service + server *httptest.Server + pgConn *sql.DB + client *hugrclient.Client + dsName string +} + +// Shared state — set up in TestMain when the postgres DSN env var is present. +// Tests Skip when sharedService is nil (DSN not configured). +var ( + sharedService *hugr.Service + sharedServer *httptest.Server + sharedPgConn *sql.DB + sharedClient *hugrclient.Client +) + +func TestMain(m *testing.M) { + dsn := os.Getenv(envPostgresDSN) + if dsn == "" { + // No DSN configured — let tests Skip individually with a friendly + // message. Don't fail the package. + os.Exit(m.Run()) + } + + schemasPath := os.Getenv(envSchemasPath) + if schemasPath == "" { + schemasPath = filepath.Join("testdata", "schemas") + } + abs, err := filepath.Abs(schemasPath) + if err != nil { + log.Fatalf("resolve schemas path: %v", err) + } + if _, err := os.Stat(filepath.Join(abs, "pg_ingest")); err != nil { + log.Fatalf("schemas/pg_ingest dir not found at %s: %v", abs, err) + } + + ctx := context.Background() + + service, err := hugr.New(hugr.Config{ + Debug: false, // shared service runs many tests — keep logs quiet + DB: db.Config{}, + CoreDB: coredb.New(coredb.Config{}), + Auth: &auth.Config{ + Providers: []auth.AuthProvider{ + auth.NewApiKey("ingest-test", auth.ApiKeyConfig{ + Key: ingestTestAPIKey, + DefaultRole: "admin", + }), + auth.NewAnonymous(auth.AnonymousConfig{ + Allowed: true, + Role: "admin", + }), + }, + }, + }) + if err != nil { + log.Fatalf("hugr.New: %v", err) + } + if err := service.Init(ctx); err != nil { + log.Fatalf("service.Init: %v", err) + } + + // Register & load the postgres data source pointed at the test database. + regRes, err := service.Query(ctx, `mutation($data: core_data_sources_mut_input_data!) 
{ + core { insert_data_sources(data: $data) { name } } + }`, map[string]any{ + "data": map[string]any{ + "name": "pg_ingest", + "type": "postgres", + "prefix": "pg_ingest", + "as_module": true, + "path": dsn, + "catalogs": []map[string]any{{ + "name": "pg_ingest", + "type": "localFS", + "path": filepath.Join(abs, "pg_ingest"), + }}, + }, + }) + if err != nil { + log.Fatalf("register pg_ingest: %v", err) + } + if regRes.Err() != nil { + log.Fatalf("register pg_ingest graphql error: %v", regRes.Err()) + } + regRes.Close() + + loadRes, err := service.Query(ctx, `mutation { function { core { load_data_source(name: "pg_ingest") { success message } } } }`, nil) + if err != nil { + log.Fatalf("load pg_ingest: %v", err) + } + if loadRes.Err() != nil { + log.Fatalf("load pg_ingest graphql error: %v", loadRes.Err()) + } + loadRes.Close() + + srv := httptest.NewServer(service) + + pgConn, err := sql.Open("pgx", dsn) + if err != nil { + log.Fatalf("open pg verifier conn: %v", err) + } + if err := pgConn.PingContext(ctx); err != nil { + log.Fatalf("ping pg verifier conn: %v", err) + } + + sharedService = service + sharedServer = srv + sharedPgConn = pgConn + sharedClient = hugrclient.NewClient(srv.URL + "/ipc") + + code := m.Run() + + _ = pgConn.Close() + srv.Close() + _ = service.Close() + os.Exit(code) +} + +func setupEnv(t *testing.T) *ingestEnv { + t.Helper() + if sharedService == nil { + t.Skipf("%s not set — run integration-test/ingest-postgres/run.sh to spin up a postgres container", envPostgresDSN) + } + + // Truncate before each test to guarantee determinism. 
+ _, err := sharedPgConn.ExecContext(context.Background(), "TRUNCATE TABLE events, binary_events RESTART IDENTITY") + require.NoError(t, err) + + return &ingestEnv{ + service: sharedService, + server: sharedServer, + pgConn: sharedPgConn, + client: sharedClient, + dsName: "pg_ingest", + } +} + +func mustQuery(t *testing.T, ctx context.Context, s *hugr.Service, q string, vars map[string]any) { + t.Helper() + res, err := s.Query(ctx, q, vars) + require.NoError(t, err) + if res.Err() != nil { + require.NoErrorf(t, res.Err(), "graphql error for query: %s", q) + } + res.Close() +} + +func registerIngestPermissionRole(t *testing.T, service *hugr.Service, role, mutationModule string) { + t.Helper() + registerIngestPermissionRoleData(t, service, role, mutationModule, map[string]any{ + "owner_id": "[$auth.user_id_int]", + }) +} + +func registerIngestPermissionRoleData(t *testing.T, service *hugr.Service, role, mutationModule string, data map[string]any) { + t.Helper() + ctx := context.Background() + mustQuery(t, ctx, service, `mutation($role: core_roles_mut_input_data!, $allowAll: core_role_permissions_mut_input_data!, $inject: core_role_permissions_mut_input_data!) 
{ + core { + insert_roles(data: $role) { name } + allow_all: insert_role_permissions(data: $allowAll) { role type_name field_name } + inject_owner: insert_role_permissions(data: $inject) { role type_name field_name } + } + }`, map[string]any{ + "role": map[string]any{ + "name": role, + "description": "IPC ingest permission data integration test role", + }, + "allowAll": map[string]any{ + "role": role, + "type_name": "*", + "field_name": "*", + }, + "inject": map[string]any{ + "role": role, + "type_name": mutationModule, + "field_name": "insert_events", + "data": data, + }, + }) +} + +func moduleMutationName(module string) string { + return "_module_" + strings.ReplaceAll(module, ".", "_") + "_mutation" +} + +// makeEventsRecord builds a single Arrow RecordBatch with the columns of the +// pg_ingest.events table (excluding id, which is autogen). +func makeEventsRecord(t *testing.T, names []string, values []float64, active []bool, payload []string, created []arrow.Timestamp) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues(names, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues(values, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues(active, nil) + pBuilder := recordFieldBuilder(t, b, "payload").(*array.StringBuilder) + for _, p := range payload { + if p == "" { + pBuilder.AppendNull() + } else { + pBuilder.Append(p) + } + } + tsBuilder := 
recordFieldBuilder(t, b, "created_at").(*array.TimestampBuilder) + tsBuilder.AppendValues(created, nil) + return b.NewRecordBatch() +} + +func makeMalformedJSONRecord(t *testing.T, binary bool) arrow.RecordBatch { + t.Helper() + payloadType := arrow.DataType(arrow.BinaryTypes.String) + payloadName := "payload" + if binary { + payloadType = arrow.BinaryTypes.Binary + payloadName = "payload_binary" + } + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: payloadName, Type: payloadType, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("malformed-json") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + payloadBuilder := recordFieldBuilder(t, b, payloadName) + if binary { + payloadBuilder.(*array.BinaryBuilder).Append([]byte(`{"unterminated":`)) + } else { + payloadBuilder.(*array.StringBuilder).Append(`{"unterminated":`) + } + return b.NewRecordBatch() +} + +func recordFieldBuilder(t *testing.T, b *array.RecordBuilder, name string) array.Builder { + t.Helper() + indices := b.Schema().FieldIndices(name) + require.Len(t, indices, 1, "arrow field %q must exist exactly once", name) + return b.Field(indices[0]) +} + +func mustRecordFieldBuilder(b *array.RecordBuilder, name string) array.Builder { + indices := b.Schema().FieldIndices(name) + if len(indices) != 1 { + panic(fmt.Sprintf("arrow field %q must exist exactly once", name)) + } + return b.Field(indices[0]) +} + +type eventsRecordBuilders struct { + names *array.StringBuilder + values *array.Float64Builder + active *array.BooleanBuilder + payloads *array.StringBuilder + createdAt 
*array.TimestampBuilder +} + +func eventsRecordBuildersFor(b *array.RecordBuilder) eventsRecordBuilders { + return eventsRecordBuilders{ + names: mustRecordFieldBuilder(b, "name").(*array.StringBuilder), + values: mustRecordFieldBuilder(b, "value").(*array.Float64Builder), + active: mustRecordFieldBuilder(b, "is_active").(*array.BooleanBuilder), + payloads: mustRecordFieldBuilder(b, "payload").(*array.StringBuilder), + createdAt: mustRecordFieldBuilder(b, "created_at").(*array.TimestampBuilder), + } +} + +type jsonPhysicalTypeSpec struct { + name string + dataType arrow.DataType + arrowExtension string + expected any + appendValue func(*testing.T, array.Builder) +} + +const ( + jsonStructKindField = iota + jsonStructCountField +) + +func jsonPhysicalTypeSpecs(t *testing.T) []jsonPhysicalTypeSpec { + t.Helper() + structType := arrow.StructOf( + arrow.Field{Name: "kind", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "count", Type: arrow.PrimitiveTypes.Int64, Nullable: false}, + ) + geoPointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + arrowJSONType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) + + return []jsonPhysicalTypeSpec{ + {name: "payload", dataType: arrow.BinaryTypes.String, expected: map[string]any{"kind": "string"}, appendValue: appendJSONText(`{"kind":"string"}`)}, + {name: "payload_large_string", dataType: arrow.BinaryTypes.LargeString, expected: map[string]any{"kind": "large_string"}, appendValue: appendJSONText(`{"kind":"large_string"}`)}, + {name: "payload_string_view", dataType: arrow.BinaryTypes.StringView, expected: map[string]any{"kind": "string_view"}, appendValue: appendJSONText(`{"kind":"string_view"}`)}, + {name: "payload_binary", dataType: arrow.BinaryTypes.Binary, expected: map[string]any{"kind": "binary"}, appendValue: 
appendJSONText(`{"kind":"binary"}`)}, + {name: "payload_large_binary", dataType: arrow.BinaryTypes.LargeBinary, expected: map[string]any{"kind": "large_binary"}, appendValue: appendJSONText(`{"kind":"large_binary"}`)}, + {name: "payload_binary_view", dataType: arrow.BinaryTypes.BinaryView, expected: map[string]any{"kind": "binary_view"}, appendValue: appendJSONText(`{"kind":"binary_view"}`)}, + {name: "payload_struct", dataType: structType, expected: map[string]any{"kind": "struct", "count": float64(14)}, appendValue: appendJSONStruct("struct", 14)}, + {name: "payload_list", dataType: arrow.ListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(1), float64(2)}, appendValue: appendInt64JSONList(1, 2)}, + {name: "payload_large_list", dataType: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(3), float64(4)}, appendValue: appendInt64JSONList(3, 4)}, + {name: "payload_fixed_size_list", dataType: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), expected: []any{float64(5), float64(6)}, appendValue: appendInt64JSONList(5, 6)}, + {name: "payload_list_view", dataType: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(7), float64(8)}, appendValue: appendInt64JSONList(7, 8)}, + {name: "payload_large_list_view", dataType: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), expected: []any{float64(9), float64(10)}, appendValue: appendInt64JSONList(9, 10)}, + {name: "payload_map", dataType: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), expected: map[string]any{"a": float64(11), "b": float64(12)}, appendValue: appendInt64JSONMap([]string{"a", "b"}, []int64{11, 12})}, + {name: "payload_scalar", dataType: arrow.PrimitiveTypes.Int64, expected: "13", appendValue: appendInt64JSONScalar(13)}, + {name: "payload_arrow_json", dataType: arrowJSONType, expected: map[string]any{"kind": "arrow_json"}, appendValue: appendArrowJSONText(`{"kind":"arrow_json"}`)}, + {name: "payload_geo_point", dataType: geoPointType, 
arrowExtension: "geoarrow.point", expected: geoJSONGeometry("Point", pointCoordinate(xyPoint{x: 30.5, y: 50.25})), appendValue: appendGeoArrowJSONPoint(xyPoint{x: 30.5, y: 50.25})}, + } +} + +func jsonPhysicalTypeColumns(t *testing.T) []string { + t.Helper() + specs := jsonPhysicalTypeSpecs(t) + columns := make([]string, 0, len(specs)) + for _, spec := range specs { + columns = append(columns, spec.name) + } + return columns +} + +func makeJSONPhysicalTypesRecord(t *testing.T) arrow.RecordBatch { + t.Helper() + pool := memory.NewGoAllocator() + specs := jsonPhysicalTypeSpecs(t) + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + for _, spec := range specs { + field := arrow.Field{Name: spec.name, Type: spec.dataType, Nullable: false} + if spec.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": spec.arrowExtension}) + } + fields = append(fields, field) + } + schema := arrow.NewSchema(fields, nil) + + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("json-physical-types") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(1) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(true) + for _, spec := range specs { + spec.appendValue(t, recordFieldBuilder(t, b, spec.name)) + } + return b.NewRecordBatch() +} + +func appendJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.StringBuilder: + b.Append(value) + case *array.LargeStringBuilder: + b.Append(value) + case *array.StringViewBuilder: + b.Append(value) + case *array.BinaryBuilder: + b.Append([]byte(value)) + case *array.BinaryViewBuilder: + 
b.Append([]byte(value)) + default: + require.Failf(t, "unsupported JSON text builder", "got %T", builder) + } + } +} + +func appendJSONStruct(kind string, count int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + structBuilder.Append(true) + structBuilder.FieldBuilder(jsonStructKindField).(*array.StringBuilder).Append(kind) + structBuilder.FieldBuilder(jsonStructCountField).(*array.Int64Builder).Append(count) + } +} + +func appendInt64JSONList(values ...int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + switch b := builder.(type) { + case *array.ListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.FixedSizeListBuilder: + b.Append(true) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.ListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + case *array.LargeListViewBuilder: + b.AppendWithSize(true, len(values)) + b.ValueBuilder().(*array.Int64Builder).AppendValues(values, nil) + default: + require.Failf(t, "unsupported JSON list builder", "got %T", builder) + } + } +} + +func appendInt64JSONMap(keys []string, values []int64) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + mapBuilder, ok := builder.(*array.MapBuilder) + require.Truef(t, ok, "got %T, want *array.MapBuilder", builder) + mapBuilder.Append(true) + mapBuilder.KeyBuilder().(*array.StringBuilder).AppendValues(keys, nil) + mapBuilder.ItemBuilder().(*array.Int64Builder).AppendValues(values, nil) + } +} + +func appendInt64JSONScalar(value int64) 
func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + intBuilder, ok := builder.(*array.Int64Builder) + require.Truef(t, ok, "got %T, want *array.Int64Builder", builder) + intBuilder.Append(value) + } +} + +func appendArrowJSONText(value string) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + extensionBuilder, ok := builder.(*array.ExtensionBuilder) + require.Truef(t, ok, "got %T, want *array.ExtensionBuilder", builder) + extensionBuilder.StorageBuilder().(*array.StringBuilder).Append(value) + } +} + +func appendGeoArrowJSONPoint(point xyPoint) func(*testing.T, array.Builder) { + return func(t *testing.T, builder array.Builder) { + t.Helper() + structBuilder, ok := builder.(*array.StructBuilder) + require.Truef(t, ok, "got %T, want *array.StructBuilder", builder) + appendPoint(structBuilder, point) + } +} + +func jsonPhysicalTypesExpected(t *testing.T) map[string]any { + t.Helper() + expected := map[string]any{"name": "json-physical-types"} + for _, spec := range jsonPhysicalTypeSpecs(t) { + expected[spec.name] = spec.expected + } + return expected +} + +func assertJSONPhysicalTypesReadThroughHugr(t *testing.T, service *hugr.Service, dsName string) { + t.Helper() + query := fmt.Sprintf(`{ + %s { + events(filter: {name: {eq: "json-physical-types"}}) { + name + %s + } + } + }`, dsName, strings.Join(jsonPhysicalTypeColumns(t), "\n")) + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data := payload["data"].(map[string]any) + root := data[dsName].(map[string]any) + rows := root["events"].([]any) + require.Len(t, rows, 1, "response: %s", string(body)) + assert.Equal(t, jsonPhysicalTypesExpected(t), 
rows[0]) +} + +// --- Tests ---------------------------------------------------------------- + +func TestIngest_Postgres_RoundTrip(t *testing.T) { + env := setupEnv(t) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"alpha", "beta", "gamma"}, + []float64{1.5, 2.5, 3.5}, + []bool{true, false, true}, + []string{`{"k":"v"}`, "", `{"x":1}`}, + []arrow.Timestamp{now, now, now}, + ) + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, "pg_ingest.events", res.DataObject) + assert.Equal(t, int64(3), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Verify by reading directly from postgres. + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 3, count) + + rows, err := env.pgConn.Query("SELECT name, value, is_active, payload IS NOT NULL FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + var ( + gotNames []string + gotValues []float64 + gotActive []bool + gotHasJSON []bool + ) + for rows.Next() { + var n string + var v float64 + var a, j bool + require.NoError(t, rows.Scan(&n, &v, &a, &j)) + gotNames = append(gotNames, n) + gotValues = append(gotValues, v) + gotActive = append(gotActive, a) + gotHasJSON = append(gotHasJSON, j) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"alpha", "beta", "gamma"}, gotNames) + assert.Equal(t, []float64{1.5, 2.5, 3.5}, gotValues) + assert.Equal(t, []bool{true, false, true}, gotActive) + assert.Equal(t, []bool{true, false, true}, gotHasJSON) // beta has NULL payload +} + +func TestIngest_Postgres_JSONPhysicalTypes(t *testing.T) { + env := setupEnv(t) + rec := makeJSONPhysicalTypesRecord(t) + defer rec.Release() + + res, err := 
env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + expectedColumns := append([]string{"name", "value", "is_active"}, jsonPhysicalTypeColumns(t)...) + assert.ElementsMatch(t, expectedColumns, res.Columns) + assertJSONPhysicalTypesReadThroughHugr(t, env.service, env.dsName) +} + +func TestIngest_Postgres_RejectsMalformedJSON(t *testing.T) { + for _, tt := range []struct { + name string + binary bool + }{ + {name: "string"}, + {name: "binary", binary: true}, + } { + t.Run(tt.name, func(t *testing.T) { + env := setupEnv(t) + rec := makeMalformedJSONRecord(t, tt.binary) + defer rec.Release() + + _, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.Error(t, err) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Zero(t, count, "a failed JSON cast must roll back the entire ingest") + }) + } +} + +func TestIngest_Postgres_UsesBinaryCopyWithoutTextOnlyTypes(t *testing.T) { + env := setupEnv(t) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + { + Name: "geom", + Type: arrow.BinaryTypes.String, + Nullable: false, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"}), + }, + }, nil) + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append("binary-copy") + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(42) + recordFieldBuilder(t, b, "geom").(*array.StringBuilder).Append("POINT (7.25 8.5)") + rec := b.NewRecordBatch() + b.Release() + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.binary_events", rec) + require.NoError(t, err) + assert.Equal(t, int64(1), res.Inserted) + + var name, geom string 
+ require.NoError(t, env.pgConn.QueryRow( + "SELECT name, ST_AsText(geom) FROM binary_events", + ).Scan(&name, &geom)) + assert.Equal(t, "binary-copy", name) + assert.Equal(t, "POINT(7.25 8.5)", compactWKT(geom)) + + const copyPrefix = `COPY "public"."binary_events"` + var serverLog string + require.Eventually(t, func() bool { + err := env.pgConn.QueryRow("SELECT pg_read_file(pg_current_logfile())").Scan(&serverLog) + return err == nil && strings.Contains(serverLog, copyPrefix) && + strings.Contains(serverLog[strings.LastIndex(serverLog, copyPrefix):], "FORMAT BINARY") + }, 5*time.Second, 100*time.Millisecond, "postgres log did not contain binary COPY for binary_events") +} + +// TestIngest_Postgres_GeometryEdgeCases verifies that the native +// DuckDB GEOMETRY -> PostGIS bridge faithfully carries geometries that the +// existing suite never exercised: SQL NULL, 3D (Z) coordinates, EMPTY +// geometries and a mixed GEOMETRYCOLLECTION. The target column is a bare +// `geometry` (no typmod) so PostGIS accepts any type/dimension and the +// assertions reflect exactly what crossed the bridge — not what a typmod +// coerced. Geometry is sent as geoarrow.wkt so DuckDB staging normalises it to +// a canonical GEOMETRY via ST_GeomFromText before the bridge writes it out. 
+func TestIngest_Postgres_GeometryEdgeCases(t *testing.T) { + env := setupEnv(t) + + _, err := env.pgConn.ExecContext(context.Background(), + "TRUNCATE TABLE geom_edge RESTART IDENTITY") + require.NoError(t, err) + + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + { + Name: "geom", + Type: arrow.BinaryTypes.String, + Nullable: true, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "geoarrow.wkt"}), + }, + }, nil) + + b := array.NewRecordBuilder(memory.NewGoAllocator(), schema) + names := recordFieldBuilder(t, b, "name").(*array.StringBuilder) + geoms := recordFieldBuilder(t, b, "geom").(*array.StringBuilder) + + names.Append("a_null") + geoms.AppendNull() + names.Append("b_point_z") + geoms.Append("POINT Z (1 2 3)") + names.Append("c_empty_point") + geoms.Append("POINT EMPTY") + names.Append("d_geomcollection") + geoms.Append("GEOMETRYCOLLECTION(POINT(1 2),LINESTRING(0 0,1 1))") + + rec := b.NewRecordBatch() + b.Release() + defer rec.Release() + + res, err := env.client.IngestRecord(context.Background(), "pg_ingest.geom_edge", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(4), res.Inserted) + + type edgeRow struct { + isNull bool + gtype string + zmflag int + isEmpty bool + numGeom int + } + rows, err := env.pgConn.Query(` + SELECT name, + geom IS NULL, + COALESCE(GeometryType(geom), ''), + COALESCE(ST_Zmflag(geom), -1), + COALESCE(ST_IsEmpty(geom), false), + COALESCE(ST_NumGeometries(geom), 0) + FROM geom_edge ORDER BY name`) + require.NoError(t, err) + defer rows.Close() + + got := map[string]edgeRow{} + for rows.Next() { + var name string + var r edgeRow + require.NoError(t, rows.Scan(&name, &r.isNull, &r.gtype, &r.zmflag, &r.isEmpty, &r.numGeom)) + got[name] = r + } + require.NoError(t, rows.Err()) + require.Len(t, got, 4) + + // NULL geometry must round-trip as SQL NULL. 
+ assert.True(t, got["a_null"].isNull, "NULL geometry must stay NULL through the native bridge") + + // 3D point: the Z dimension must survive DuckDB GEOMETRY -> PostGIS. + assert.False(t, got["b_point_z"].isNull) + assert.Equal(t, "POINT", got["b_point_z"].gtype) + assert.Equal(t, 2, got["b_point_z"].zmflag, "ST_Zmflag 2 == XYZ (Z present, no M)") + + // EMPTY geometry must remain an empty geometry of the right type. + assert.Equal(t, "POINT", got["c_empty_point"].gtype) + assert.True(t, got["c_empty_point"].isEmpty, "POINT EMPTY must survive as empty") + + // Mixed GeometryCollection must keep its member count. + assert.Equal(t, "GEOMETRYCOLLECTION", got["d_geomcollection"].gtype) + assert.Equal(t, 2, got["d_geomcollection"].numGeom) + + // Exact coordinates for the 3D point. + var x, y, z float64 + require.NoError(t, env.pgConn.QueryRow( + "SELECT ST_X(geom), ST_Y(geom), ST_Z(geom) FROM geom_edge WHERE name = 'b_point_z'", + ).Scan(&x, &y, &z)) + assert.Equal(t, [3]float64{1, 2, 3}, [3]float64{x, y, z}) +} + +func TestIngest_Postgres_PermissionData(t *testing.T) { + env := setupEnv(t) + + const ownerID = 4343 + role := "ingest_perm_pg" + registerIngestPermissionRole(t, env.service, role, moduleMutationName(env.dsName)) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-alpha", "perm-beta"}, + []float64{11.5, 12.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo(strconv.Itoa(ownerID), "permission-user"), + ) + res, err := permClient.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "owner_id", "owner_id must be injected by 
permissions, not sent in Arrow") + + rows, err := env.pgConn.Query("SELECT name, owner_id FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]int64{} + for rows.Next() { + var ( + name string + ownerID int64 + ) + require.NoError(t, rows.Scan(&name, &ownerID)) + got[name] = ownerID + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]int64{ + "perm-alpha": ownerID, + "perm-beta": ownerID, + }, got) +} + +func TestIngest_Postgres_PermissionDataGeometry(t *testing.T) { + env := setupEnv(t) + + role := "ingest_perm_geom_pg" + registerIngestPermissionRoleData(t, env.service, role, moduleMutationName(env.dsName), map[string]any{ + "geom": "POINT (7.25 8.5)", + }) + + now := arrow.Timestamp(time.Date(2026, 5, 21, 12, 0, 0, 0, time.UTC).UnixMicro()) + rec := makeEventsRecord(t, + []string{"perm-geom-alpha", "perm-geom-beta"}, + []float64{21.5, 22.5}, + []bool{true, true}, + []string{"", ""}, + []arrow.Timestamp{now, now}, + ) + defer rec.Release() + + permClient := hugrclient.NewClient(env.server.URL+"/ipc", + hugrclient.WithApiKey(ingestTestAPIKey), + hugrclient.WithUserRole(role), + hugrclient.WithUserInfo("7", "permission-geometry-user"), + ) + res, err := permClient.IngestRecord(context.Background(), "pg_ingest.events", rec) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + assert.NotContains(t, res.Columns, "geom", "geom must be injected by permissions, not sent in Arrow") + + rows, err := env.pgConn.Query("SELECT name, ST_AsText(geom), ST_SRID(geom) FROM events ORDER BY name") + require.NoError(t, err) + defer rows.Close() + + got := map[string]string{} + gotSRID := map[string]int{} + for rows.Next() { + var name, geom string + var srid int + require.NoError(t, rows.Scan(&name, &geom, &srid)) + got[name] = compactWKT(geom) + gotSRID[name] = srid + } + require.NoError(t, rows.Err()) + assert.Equal(t, map[string]string{ + "perm-geom-alpha": "POINT(7.25 8.5)", + 
"perm-geom-beta": "POINT(7.25 8.5)", + }, got) + assert.Equal(t, map[string]int{ + "perm-geom-alpha": 0, + "perm-geom-beta": 0, + }, gotSRID) +} + +func TestIngest_Postgres_MultipleBatches(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) + mk := func(names []string) arrow.RecordBatch { + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + fields := eventsRecordBuildersFor(b) + fields.names.AppendValues(names, nil) + vals := make([]float64, len(names)) + for i := range vals { + vals[i] = float64(i) + } + fields.values.AppendValues(vals, nil) + active := make([]bool, len(names)) + for i := range active { + active[i] = true + } + fields.active.AppendValues(active, nil) + fields.payloads.AppendNulls(len(names)) + ts := make([]arrow.Timestamp, len(names)) + for i := range ts { + ts[i] = arrow.Timestamp(time.Now().UTC().UnixMicro()) + } + fields.createdAt.AppendValues(ts, nil) + return b.NewRecordBatch() + } + rec1 := mk([]string{"a", "b"}) + defer rec1.Release() + rec2 := mk([]string{"c", "d", "e"}) + defer rec2.Release() + + rr, err := array.NewRecordReader(schema, []arrow.RecordBatch{rec1, rec2}) + require.NoError(t, err) + defer rr.Release() + + res, err := env.client.Ingest(context.Background(), "pg_ingest.events", rr) + require.NoError(t, err) + assert.Equal(t, int64(5), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 5, count) +} + +// TestIngest_Postgres_Bulk exercises the typed Go client at real-world scale: +// 50 batches × 
1000 rows streamed through array.RecordReader, never +// materialised in memory beyond the current batch. Mirrors the wire-level +// bulk path in TestIngest_HTTP_Direct, but goes through hugrclient.Client. +func TestIngest_Postgres_Bulk(t *testing.T) { + env := setupEnv(t) + + const ( + numBatches = 50 + rowsPerBatch = 1000 + totalRows = numBatches * rowsPerBatch + ) + + reader := newLazyEventsReader( + memory.NewGoAllocator(), + numBatches, rowsPerBatch, + time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC), + ) + defer reader.Release() + + start := time.Now() + res, err := env.client.Ingest(context.Background(), "pg_ingest.events", reader) + elapsed := time.Since(start) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(totalRows), res.Inserted) + assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, res.Columns) + + // Time the COUNT(*) immediately after the POST returns. If the server + // were lying / writing asynchronously, this query would either be slow + // (waiting for in-flight writes to land) or return a partial value. + countStart := time.Now() + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + countElapsed := time.Since(countStart) + assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns") + t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed) + + // Spot-check the first five rows for per-column fidelity through the + // client → server → DuckDB → postgres-extension → Postgres pipeline. 
+ rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`) + require.NoError(t, err) + defer rows.Close() + var ( + sampleNames []string + sampleValues []float64 + sampleActive []bool + samplePayloadNull []bool + ) + for rows.Next() { + var n string + var v float64 + var a, pn bool + require.NoError(t, rows.Scan(&n, &v, &a, &pn)) + sampleNames = append(sampleNames, n) + sampleValues = append(sampleValues, v) + sampleActive = append(sampleActive, a) + samplePayloadNull = append(samplePayloadNull, pn) + } + require.NoError(t, rows.Err()) + assert.Equal(t, []string{"evt-000000", "evt-000001", "evt-000002", "evt-000003", "evt-000004"}, sampleNames) + assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues) + assert.Equal(t, []bool{true, false, true, false, true}, sampleActive) + // row%5 == 0 ⇒ payload IS NULL; in the first five rows that's just row 0. + assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull) + + var activeCount int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount)) + assert.Equal(t, totalRows/2, activeCount) + + t.Logf("bulk ingest via Go client: %d rows in %d batches in %s (%.0f rows/s)", + totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds()) +} + +// TestIngest_Postgres_Stream covers Client.IngestStream — the low-level API +// that takes a raw Arrow IPC stream as io.Reader. We serialise a buffer +// ourselves and verify it lands in Postgres. 
+func TestIngest_Postgres_Stream(t *testing.T) { + env := setupEnv(t) + + pool := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + }, nil) + b := array.NewRecordBuilder(pool, schema) + recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"s1", "s2"}, nil) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).AppendValues([]float64{10, 20}, nil) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true, false}, nil) + rec := b.NewRecordBatch() + b.Release() + defer rec.Release() + + var buf bytes.Buffer + w := ipc.NewWriter(&buf, ipc.WithSchema(schema)) + require.NoError(t, w.Write(rec)) + require.NoError(t, w.Close()) + + res, err := env.client.IngestStream(context.Background(), "pg_ingest.events", &buf) + require.NoError(t, err) + require.NotNil(t, res) + assert.Equal(t, int64(2), res.Inserted) + + var count int + require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count)) + assert.Equal(t, 2, count) +} + +// TestIngest_Postgres_Stream_Empty checks that IngestStream rejects nil body +// without sending anything to the server. +func TestIngest_Postgres_Stream_Empty(t *testing.T) { + env := setupEnv(t) + _, err := env.client.IngestStream(context.Background(), "pg_ingest.events", nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "body is nil") + + _, err = env.client.IngestStream(context.Background(), "", bytes.NewReader([]byte{})) + require.Error(t, err) + assert.Contains(t, err.Error(), "data_object") +} + +// arrowFileFormat picks between Arrow IPC stream (no magic) and Arrow IPC +// file (ARROW1 prefix) for the writeEventsArrowFile helper. 
+type arrowFileFormat int + +const ( + arrowStreamFormat arrowFileFormat = iota + arrowFileFmt +) + +func eventsArrowSchema() *arrow.Schema { + return arrow.NewSchema([]arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true}, + }, nil) +} + +func eventsArrowFileSchema() *arrow.Schema { + fields := append([]arrow.Field{}, eventsArrowSchema().Fields()...) + fields = append(fields, geometryArrowFields()...) + return arrow.NewSchema(fields, nil) +} + +type geometryTypesRow struct { + name string + value float64 + active bool + point xyPoint + shapeOrigin xyPoint +} + +func makeGeometryTypesRecord(t *testing.T, rows []geometryTypesRow) (arrow.RecordBatch, *arrow.Schema) { + t.Helper() + + schema := geometryTypesSchema() + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for _, row := range rows { + appendGeometryTypesRow(t, b, row) + } + + return b.NewRecordBatch(), schema +} + +func geometryTypesSchema() *arrow.Schema { + fields := []arrow.Field{ + {Name: "name", Type: arrow.BinaryTypes.String, Nullable: false}, + {Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + {Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + } + fields = append(fields, geometryArrowFields()...) 
+ return arrow.NewSchema(fields, nil) +} + +func geometryArrowFields() []arrow.Field { + pointType := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType := arrow.ListOf(pointType) + polygonType := arrow.ListOf(lineType) + fields := make([]arrow.Field, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + field := arrow.Field{ + Name: col.name, + Type: col.arrowType, + Nullable: false, + } + if col.arrowExtension != "" { + field.Metadata = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": col.arrowExtension}) + } + fields = append(fields, field) + } + return fields +} + +type geometryValueColumn struct { + name string + arrowType arrow.DataType + arrowExtension string + expectedWKT func(point, x, y string) string + expectedSRID int +} + +func geometryValueColumns(pointType, lineType, polygonType arrow.DataType) []geometryValueColumn { + geoJSONStructType := arrow.StructOf( + arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: false}, + arrow.Field{Name: "coordinates", Type: arrow.ListOf(arrow.ListOf(arrow.ListOf(arrow.PrimitiveTypes.Float64))), Nullable: false}, + ) + line := func(_ string, x string, y string) string { + return fmt.Sprintf("LINESTRING(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 1)) + } + polygon := func(_ string, x string, y string) string { return polygonWKT(x, y) } + point := func(point string, _ string, _ string) string { return point } + multiPoint := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTIPOINT(%s %s,%s %s,%s %s)", x, y, addCoord(x, 1), addCoord(y, 1), addCoord(x, 2), y) + } + multiLine := func(_ string, x string, y string) string { + return fmt.Sprintf("MULTILINESTRING((%s %s,%s %s),(%s %s,%s %s))", x, y, addCoord(x, 
1), addCoord(y, 1), addCoord(x, 2), addCoord(y, 2), addCoord(x, 3), addCoord(y, 3)) + } + multiPolygon := func(_ string, x string, y string) string { return multiPolygonWKT(x, y) } + + return []geometryValueColumn{ + {name: "geom", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point}, + {name: "geom_4326", arrowType: pointType, arrowExtension: "geoarrow.point", expectedWKT: point, expectedSRID: 4326}, + {name: "geom_wkt", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line}, + {name: "geom_wkt_4326", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.wkt", expectedWKT: line, expectedSRID: 4326}, + {name: "geom_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geoarrow.geojson", expectedWKT: polygon}, + {name: "geom_hugr_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.geojson", expectedWKT: polygon}, + {name: "geom_plain_geojson", arrowType: arrow.BinaryTypes.String, arrowExtension: "geojson", expectedWKT: polygon}, + {name: "geom_geojson_struct", arrowType: geoJSONStructType, expectedWKT: polygon}, + {name: "geom_geojson_arrow_json", arrowType: mustArrowJSONType(), arrowExtension: "arrow.json", expectedWKT: polygon}, + {name: "geom_wkb", arrowType: arrow.BinaryTypes.Binary, arrowExtension: "geoarrow.wkb", expectedWKT: point}, + {name: "geom_hexwkb", arrowType: arrow.BinaryTypes.String, arrowExtension: "hugr.hexwkb", expectedWKT: point}, + {name: "geom_line", arrowType: lineType, arrowExtension: "geoarrow.linestring", expectedWKT: line}, + {name: "geom_polygon_native", arrowType: polygonType, arrowExtension: "geoarrow.polygon", expectedWKT: polygon}, + {name: "geom_multipoint", arrowType: lineType, arrowExtension: "geoarrow.multipoint", expectedWKT: multiPoint}, + {name: "geom_multiline", arrowType: polygonType, arrowExtension: "geoarrow.multilinestring", expectedWKT: multiLine}, + {name: "geom_multipolygon", arrowType: arrow.ListOf(polygonType), arrowExtension: 
"geoarrow.multipolygon", expectedWKT: multiPolygon}, + } +} + +func mustArrowJSONType() arrow.DataType { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + panic(err) + } + return typ +} + +func geometryTypesColumns() []string { + pointType, lineType, polygonType := geometryArrowTypes() + columns := []string{"name", "value", "is_active"} + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + columns = append(columns, col.name) + } + return columns +} + +func geometryExpected(point, x, y string) []string { + pointType, lineType, polygonType := geometryArrowTypes() + values := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + values = append(values, col.expectedWKT(point, x, y)) + } + return values +} + +func geometrySRIDExpected() []int { + pointType, lineType, polygonType := geometryArrowTypes() + srids := make([]int, 0, len(geometryValueColumns(pointType, lineType, polygonType))) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + srids = append(srids, col.expectedSRID) + } + return srids +} + +func geometryArrowTypes() (pointType, lineType, polygonType arrow.DataType) { + pointType = arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64, Nullable: false}, + ) + lineType = arrow.ListOf(pointType) + polygonType = arrow.ListOf(lineType) + return pointType, lineType, polygonType +} + +func geometrySelectList(withSRID bool) string { + pointType, lineType, polygonType := geometryArrowTypes() + exprs := make([]string, 0, len(geometryValueColumns(pointType, lineType, polygonType))*2) + for _, col := range geometryValueColumns(pointType, lineType, polygonType) { + exprs = append(exprs, "ST_AsText("+col.name+")") + if withSRID { + exprs = append(exprs, "ST_SRID("+col.name+")") + } 
+ } + return strings.Join(exprs, ",\n") +} + +type sqlScanner interface { + Scan(dest ...any) error +} + +func scanGeometryValuesWithSRID(t *testing.T, scanner sqlScanner) ([]string, []int) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + values := make([]string, len(columns)) + srids := make([]int, len(columns)) + scanArgs := make([]any, 0, len(columns)*2) + for i := range columns { + scanArgs = append(scanArgs, &values[i], &srids[i]) + } + require.NoError(t, scanner.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return values, srids +} + +func scanNamedGeometryValuesWithSRID(t *testing.T, rows *sql.Rows) (string, []string, []int) { + t.Helper() + pointType, lineType, polygonType := geometryArrowTypes() + columns := geometryValueColumns(pointType, lineType, polygonType) + var name string + values := make([]string, len(columns)) + srids := make([]int, len(columns)) + scanArgs := []any{&name} + for i := range columns { + scanArgs = append(scanArgs, &values[i], &srids[i]) + } + require.NoError(t, rows.Scan(scanArgs...)) + for i := range values { + values[i] = compactWKT(values[i]) + } + return name, values, srids +} + +func polygonWKT(x, y string) string { + return fmt.Sprintf("POLYGON((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + ) +} + +func multiPolygonWKT(x, y string) string { + return fmt.Sprintf("MULTIPOLYGON(((%s %s,%s %s,%s %s,%s %s,%s %s),(%s %s,%s %s,%s %s,%s %s,%s %s)),((%s %s,%s %s,%s %s,%s %s,%s %s)))", + x, y, + x, addCoord(y, 4), + addCoord(x, 4), addCoord(y, 4), + addCoord(x, 4), y, + x, y, + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 
2), addCoord(y, 1), + addCoord(x, 2), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 2), + addCoord(x, 1), addCoord(y, 1), + addCoord(x, 10), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 12), + addCoord(x, 12), addCoord(y, 10), + addCoord(x, 10), addCoord(y, 10), + ) +} + +func assertGeometryReadThroughHugr(t *testing.T, service *hugr.Service, dsName, filter string, expected []map[string]any) { + t.Helper() + + query := fmt.Sprintf(`{ + %s { + events(%s, order_by: [{field: "name", direction: ASC}]) { + name + geom + geom_4326 + geom_wkt + geom_wkt_4326 + geom_geojson + geom_hugr_geojson + geom_plain_geojson + geom_geojson_struct + geom_geojson_arrow_json + geom_wkb + geom_hexwkb + geom_line + geom_polygon_native + geom_multipoint + geom_multiline + geom_multipolygon + } + } + }`, dsName, filter) + + res, err := service.Query(context.Background(), query, nil) + require.NoError(t, err) + defer res.Close() + require.NoErrorf(t, res.Err(), "graphql error for query: %s", query) + + body, err := json.Marshal(res) + require.NoError(t, err) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + data, ok := payload["data"].(map[string]any) + require.True(t, ok, "response data must be an object: %s", string(body)) + root, ok := data[dsName].(map[string]any) + require.True(t, ok, "response data.%s must be an object: %s", dsName, string(body)) + rawRows, ok := root["events"].([]any) + require.True(t, ok, "response data.%s.events must be an array: %s", dsName, string(body)) + + got := make([]map[string]any, 0, len(rawRows)) + for _, raw := range rawRows { + row, ok := raw.(map[string]any) + require.True(t, ok, "event row must be an object: %#v", raw) + got = append(got, row) + } + assert.Equal(t, expected, got) +} + +func geometryReadExpected(name string, point xyPoint, x, y float64) map[string]any { + return map[string]any{ + "name": name, + "geom": geoJSONGeometry("Point", pointCoordinate(point)), + 
"geom_4326": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_wkt": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_wkt_4326": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_hugr_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_plain_geojson": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_struct": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_geojson_arrow_json": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_wkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_hexwkb": geoJSONGeometry("Point", pointCoordinate(point)), + "geom_line": geoJSONGeometry("LineString", pointCoordinates(linePoints(x, y))), + "geom_polygon_native": geoJSONGeometry("Polygon", nestedPointCoordinates(polygonRings(x, y))), + "geom_multipoint": geoJSONGeometry("MultiPoint", pointCoordinates(multiPoints(x, y))), + "geom_multiline": geoJSONGeometry("MultiLineString", nestedPointCoordinates(multiLines(x, y))), + "geom_multipolygon": geoJSONGeometry("MultiPolygon", deepPointCoordinates(multiPolygons(x, y))), + } +} + +func geoJSONGeometry(typ string, coordinates any) map[string]any { + return map[string]any{ + "type": typ, + "coordinates": coordinates, + } +} + +func pointCoordinate(point xyPoint) []any { + return []any{point.x, point.y} +} + +func pointCoordinates(points []xyPoint) []any { + coords := make([]any, 0, len(points)) + for _, point := range points { + coords = append(coords, pointCoordinate(point)) + } + return coords +} + +func nestedPointCoordinates(lines [][]xyPoint) []any { + coords := make([]any, 0, len(lines)) + for _, line := range lines { + coords = append(coords, pointCoordinates(line)) + } + return coords +} + +func deepPointCoordinates(polygons [][][]xyPoint) []any 
{ + coords := make([]any, 0, len(polygons)) + for _, polygon := range polygons { + coords = append(coords, nestedPointCoordinates(polygon)) + } + return coords +} + +func addCoord(v string, delta float64) string { + f, err := strconv.ParseFloat(v, 64) + if err != nil { + panic(err) + } + return coord(f + delta) +} + +func compactWKT(s string) string { + s = strings.ReplaceAll(s, ", ", ",") + s = strings.ReplaceAll(s, " (", "(") + if strings.HasPrefix(s, "MULTIPOINT((") && strings.HasSuffix(s, "))") { + inner := strings.TrimSuffix(strings.TrimPrefix(s, "MULTIPOINT(("), "))") + s = "MULTIPOINT(" + strings.ReplaceAll(inner, "),(", ",") + ")" + } + return s +} + +func buildGeometryTypesBatch(t *testing.T, pool memory.Allocator, schema *arrow.Schema, batchIdx, rowsPerBatch int, namePrefix string) arrow.RecordBatch { + t.Helper() + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + for i := 0; i < rowsPerBatch; i++ { + row := batchIdx*rowsPerBatch + i + name, point := geometryBatchRow(namePrefix, row) + appendGeometryTypesRow(t, b, geometryTypesRow{ + name: name, + value: float64(row) * 0.5, + active: row%2 == 0, + point: point, + shapeOrigin: point, + }) + } + return b.NewRecordBatch() +} + +func geometryBatchRow(namePrefix string, row int) (string, xyPoint) { + return fmt.Sprintf("%s-%06d", namePrefix, row), xyPoint{ + x: float64(row % 100), + y: float64(row / 1000), + } +} + +func appendGeometryTypesRow(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + recordFieldBuilder(t, b, "name").(*array.StringBuilder).Append(row.name) + recordFieldBuilder(t, b, "value").(*array.Float64Builder).Append(row.value) + recordFieldBuilder(t, b, "is_active").(*array.BooleanBuilder).Append(row.active) + appendGeometryValueFields(t, b, row) +} + +func appendGeometryValueFields(t *testing.T, b *array.RecordBuilder, row geometryTypesRow) { + t.Helper() + x, y := row.shapeOrigin.x, row.shapeOrigin.y + + appendPoint(recordFieldBuilder(t, b, 
"geom").(*array.StructBuilder), row.point) + appendPoint(recordFieldBuilder(t, b, "geom_4326").(*array.StructBuilder), row.point) + recordFieldBuilder(t, b, "geom_wkt").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_wkt_4326").(*array.StringBuilder).Append(lineWKT(x, y)) + recordFieldBuilder(t, b, "geom_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_hugr_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + recordFieldBuilder(t, b, "geom_plain_geojson").(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + appendGeoJSONPolygonStruct(t, recordFieldBuilder(t, b, "geom_geojson_struct"), x, y) + recordFieldBuilder(t, b, "geom_geojson_arrow_json").(*array.ExtensionBuilder).StorageBuilder().(*array.StringBuilder).Append(polygonGeoJSON(x, y)) + + wkbPoint, err := wkb.Marshal(orb.Point{row.point.x, row.point.y}) + require.NoError(t, err) + recordFieldBuilder(t, b, "geom_wkb").(*array.BinaryBuilder).Append(wkbPoint) + recordFieldBuilder(t, b, "geom_hexwkb").(*array.StringBuilder).Append(strings.ToUpper(hex.EncodeToString(wkbPoint))) + appendPointList(recordFieldBuilder(t, b, "geom_line").(*array.ListBuilder), linePoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_polygon_native").(*array.ListBuilder), polygonRings(x, y)) + appendPointList(recordFieldBuilder(t, b, "geom_multipoint").(*array.ListBuilder), multiPoints(x, y)) + appendPointListList(recordFieldBuilder(t, b, "geom_multiline").(*array.ListBuilder), multiLines(x, y)) + appendPointListListList(recordFieldBuilder(t, b, "geom_multipolygon").(*array.ListBuilder), multiPolygons(x, y)) +} + +type xyPoint struct { + x float64 + y float64 +} + +const ( + geoArrowPointXField = iota + geoArrowPointYField +) + +const ( + geoJSONGeometryTypeField = iota + geoJSONGeometryCoordinatesField +) + +func appendPoint(sb *array.StructBuilder, point xyPoint) { + sb.Append(true) + 
sb.FieldBuilder(geoArrowPointXField).(*array.Float64Builder).Append(point.x)
+	sb.FieldBuilder(geoArrowPointYField).(*array.Float64Builder).Append(point.y)
+}
+
+// appendGeoJSONPolygonStruct appends one non-null struct row to builder: the
+// geometry-type field is set to "Polygon" and the coordinates field is filled
+// from polygonRings(x, y). The test fails immediately if builder is not a
+// *array.StructBuilder; the field builders are type-asserted without a check,
+// so a struct layout that does not match panics.
+func appendGeoJSONPolygonStruct(t *testing.T, builder array.Builder, x, y float64) {
+	t.Helper()
+	sb, ok := builder.(*array.StructBuilder)
+	require.Truef(t, ok, "got %T, want *array.StructBuilder", builder)
+
+	sb.Append(true)
+	sb.FieldBuilder(geoJSONGeometryTypeField).(*array.StringBuilder).Append("Polygon")
+	appendGeoJSONPolygonCoordinates(sb.FieldBuilder(geoJSONGeometryCoordinatesField).(*array.ListBuilder), polygonRings(x, y))
+}
+
+// appendGeoJSONPolygonCoordinates appends one list entry to lb holding rings
+// as three nested list levels: polygon -> ring -> point -> [x, y] float64
+// values. Each parent slot is opened with Append(true) before its children are
+// written, so the statement order here must not be rearranged.
+func appendGeoJSONPolygonCoordinates(lb *array.ListBuilder, rings [][]xyPoint) {
+	lb.Append(true)
+	ringBuilder := lb.ValueBuilder().(*array.ListBuilder)
+	for _, ring := range rings {
+		ringBuilder.Append(true)
+		pointBuilder := ringBuilder.ValueBuilder().(*array.ListBuilder)
+		for _, point := range ring {
+			pointBuilder.Append(true)
+			pointBuilder.ValueBuilder().(*array.Float64Builder).AppendValues([]float64{point.x, point.y}, nil)
+		}
+	}
+}
+
+// appendPointList appends one list entry to lb and then one struct value per
+// point via appendPoint. lb's value builder must be a *array.StructBuilder.
+func appendPointList(lb *array.ListBuilder, points []xyPoint) {
+	lb.Append(true)
+	sb := lb.ValueBuilder().(*array.StructBuilder)
+	for _, point := range points {
+		appendPoint(sb, point)
+	}
+}
+
+// appendPointListList appends one list entry to lb whose children are point
+// lists, one per element of lines (see appendPointList).
+func appendPointListList(lb *array.ListBuilder, lines [][]xyPoint) {
+	lb.Append(true)
+	inner := lb.ValueBuilder().(*array.ListBuilder)
+	for _, points := range lines {
+		appendPointList(inner, points)
+	}
+}
+
+// appendPointListListList appends one list entry to lb whose children are
+// lists of point lists, one per element of polygons (see appendPointListList).
+func appendPointListListList(lb *array.ListBuilder, polygons [][][]xyPoint) {
+	lb.Append(true)
+	inner := lb.ValueBuilder().(*array.ListBuilder)
+	for _, rings := range polygons {
+		appendPointListList(inner, rings)
+	}
+}
+
+// linePoints returns the three points (x, y), (x+1, y+1), (x+2, y+1). lineWKT
+// formats the same coordinates as text.
+func linePoints(x, y float64) []xyPoint {
+	return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}}
+}
+
+// polygonRings returns two closed rings: a 4x4 square anchored at (x, y) and a
+// 1x1 square anchored at (x+1, y+1). Each ring repeats its first point as its
+// last. polygonGeoJSON formats the same coordinates as text.
+func polygonRings(x, y float64) [][]xyPoint {
+	return [][]xyPoint{
+		{{x: x, y: y}, {x: x, y: y + 4}, {x: x + 4, y: y + 4}, {x: x + 4, y: y}, {x: x, y: y}},
+		{{x: x + 1, y: y + 1}, {x: x + 2, y: y + 1}, {x: x + 2, y: y + 2}, {x: x + 1, y: y + 2}, {x: x + 1, y: y + 1}},
+	}
+}
+
+// multiPoints returns the three points (x, y), (x+1, y+1), (x+2, y).
+func multiPoints(x, y float64) []xyPoint {
+	return []xyPoint{{x: x, y: y}, {x: x + 1, y: y + 1}, {x: x + 2, y: y}}
+}
+
+// multiLines returns two two-point lines: (x, y)-(x+1, y+1) and
+// (x+2, y+2)-(x+3, y+3).
+func multiLines(x, y float64) [][]xyPoint {
+	return [][]xyPoint{
+		{{x: x, y: y}, {x: x + 1, y: y + 1}},
+		{{x: x + 2, y: y + 2}, {x: x + 3, y: y + 3}},
+	}
+}
+
+// multiPolygons returns two polygons: polygonRings(x, y) and a single-ring
+// 2x2 square anchored at (x+10, y+10).
+func multiPolygons(x, y float64) [][][]xyPoint {
+	return [][][]xyPoint{
+		polygonRings(x, y),
+		{{{x: x + 10, y: y + 10}, {x: x + 10, y: y + 12}, {x: x + 12, y: y + 12}, {x: x + 12, y: y + 10}, {x: x + 10, y: y + 10}}},
+	}
+}
+
+// lineWKT returns the WKT LINESTRING text for the coordinates produced by
+// linePoints(x, y), with every number rendered by coord.
+func lineWKT(x, y float64) string {
+	return fmt.Sprintf("LINESTRING (%s %s, %s %s, %s %s)",
+		coord(x), coord(y),
+		coord(x+1), coord(y+1),
+		coord(x+2), coord(y+1))
+}
+
+// pointWKT returns "POINT(x y)" for point. Unlike lineWKT there is no space
+// between the keyword and the parenthesis.
+// NOTE(review): presumably this matches the text form the database returns,
+// since callers compare it against queried values — confirm.
+func pointWKT(point xyPoint) string {
+	return fmt.Sprintf("POINT(%s %s)", coord(point.x), coord(point.y))
+}
+
+// polygonGeoJSON returns a compact GeoJSON Polygon document whose two rings
+// carry the same coordinates as polygonRings(x, y).
+func polygonGeoJSON(x, y float64) string {
+	return fmt.Sprintf(`{"type":"Polygon","coordinates":[[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]],[[%s,%s],[%s,%s],[%s,%s],[%s,%s],[%s,%s]]]}`,
+		coord(x), coord(y),
+		coord(x), coord(y+4),
+		coord(x+4), coord(y+4),
+		coord(x+4), coord(y),
+		coord(x), coord(y),
+		coord(x+1), coord(y+1),
+		coord(x+2), coord(y+1),
+		coord(x+2), coord(y+2),
+		coord(x+1), coord(y+2),
+		coord(x+1), coord(y+1))
+}
+
+// coord formats v in plain decimal notation (no exponent) using the fewest
+// digits that still round-trip to the same float64.
+func coord(v float64) string {
+	return strconv.FormatFloat(v, 'f', -1, 64)
+}
+
+// arrowIPCRecordWriter is the Write/Close subset shared by the two writers
+// that newArrowIPCRecordWriter can return (ipc.NewWriter, ipc.NewFileWriter).
+type arrowIPCRecordWriter interface {
+	Write(arrow.RecordBatch) error
+	Close() error
+}
+
+// newArrowIPCRecordWriter returns a writer over f for the requested format:
+// ipc.NewWriter for arrowStreamFormat, ipc.NewFileWriter for arrowFileFmt.
+// Any other format value, or a file-writer construction error, fails the test.
+func newArrowIPCRecordWriter(t *testing.T, f *os.File, schema *arrow.Schema, format arrowFileFormat) arrowIPCRecordWriter {
+	t.Helper()
+
+	switch format {
+	case arrowStreamFormat:
+		return ipc.NewWriter(f, ipc.WithSchema(schema))
+	case arrowFileFmt:
+		w, err := ipc.NewFileWriter(f, ipc.WithSchema(schema))
+		require.NoError(t, err)
+		return w
+	default:
+		t.Fatalf("unknown arrow file format: %d", format)
+		return nil
+	}
+}
+
+// writeArrowIPCFile creates path and writes numBatches record batches to it in
+// the given format. buildBatch is called once per batch index; the returned
+// batch is released here after it has been written, so the callback hands
+// over ownership. The writer is closed explicitly before the deferred file
+// close runs.
+func writeArrowIPCFile(t *testing.T, path string, schema *arrow.Schema, format arrowFileFormat, numBatches int, buildBatch func(batchIdx int) arrow.RecordBatch) {
+	t.Helper()
+
+	f, err := os.Create(path)
+	require.NoError(t, err)
+	defer f.Close()
+
+	w := newArrowIPCRecordWriter(t, f, schema, format)
+	for batchIdx := 0; batchIdx < numBatches; batchIdx++ {
+		rec := buildBatch(batchIdx)
+		require.NoError(t, w.Write(rec))
+		rec.Release()
+	}
+	require.NoError(t, w.Close())
+}
+
+// writeEventsArrowFile produces an Arrow IPC file at path in the given
+// format with numBatches × rowsPerBatch synthetic events rows. namePrefix is
+// embedded in the `name` column so different tests can write to the same
+// table without colliding on uniqueness assertions.
+//
+// Per row (global index row): value is row*0.5, is_active is true for even
+// rows, payload is NULL for every fifth row, and created_at advances by one
+// millisecond per row from a fixed base time.
+func writeEventsArrowFile(t *testing.T, path, namePrefix string, format arrowFileFormat, numBatches, rowsPerBatch int) {
+	t.Helper()
+	pool := memory.NewGoAllocator()
+	schema := eventsArrowFileSchema()
+	base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC)
+
+	writeArrowIPCFile(t, path, schema, format, numBatches, func(batchIdx int) arrow.RecordBatch {
+		rb := array.NewRecordBuilder(pool, schema)
+		fields := eventsRecordBuildersFor(rb)
+		for i := 0; i < rowsPerBatch; i++ {
+			row := batchIdx*rowsPerBatch + i
+			name, point := geometryBatchRow(namePrefix, row)
+			fields.names.Append(name)
+			fields.values.Append(float64(row) * 0.5)
+			fields.active.Append(row%2 == 0)
+			if row%5 == 0 {
+				fields.payloads.AppendNull()
+			} else {
+				fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row))
+			}
+			fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro()))
+			appendGeometryValueFields(t, rb, geometryTypesRow{point: point, shapeOrigin: point})
+		}
+		// The batch holds its own reference, so the builder can be released
+		// before the batch is handed to the writer.
+		rec := rb.NewRecordBatch()
+		rb.Release()
+		return rec
+	})
+}
+
+// TestIngest_Postgres_ArrowIPCFile_StreamFormat builds a 50×1000-row Arrow
+// IPC *stream* file on disk and ingests it via IngestArrowIPCFile. The
+// client should detect "no ARROW1 magic" and byte-forward the file body
+// straight into /ipc/ingest — the bulk path with zero re-serialisation.
+func TestIngest_Postgres_ArrowIPCFile_StreamFormat(t *testing.T) {
+	env := setupEnv(t)
+
+	const (
+		numBatches   = 50
+		rowsPerBatch = 1000
+		totalRows    = numBatches * rowsPerBatch
+		namePrefix   = "stream"
+	)
+
+	path := filepath.Join(t.TempDir(), "events_stream.arrows")
+	writeEventsArrowFile(t, path, namePrefix, arrowStreamFormat, numBatches, rowsPerBatch)
+
+	// Sanity-check that the file is actually stream format (no ARROW1).
+	head, err := os.ReadFile(path)
+	require.NoError(t, err)
+	require.GreaterOrEqual(t, len(head), 6)
+	assert.NotEqual(t, "ARROW1", string(head[:6]), "test setup must produce stream format (no ARROW1 magic)")
+
+	// elapsed covers only the ingest call, so the rows/s figure logged at the
+	// end excludes file generation and verification.
+	start := time.Now()
+	res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path)
+	elapsed := time.Since(start)
+	require.NoError(t, err)
+	require.NotNil(t, res)
+	assert.Equal(t, int64(totalRows), res.Inserted)
+
+	// Synchronicity check: COUNT(*) must see all rows the moment POST returns.
+	// NOTE(review): the unfiltered COUNT(*) assumes the events table is empty
+	// when the test starts — presumably setupEnv guarantees that; confirm.
+	countStart := time.Now()
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count))
+	countElapsed := time.Since(countStart)
+	assert.Equal(t, totalRows, count, "all rows must be visible immediately")
+	t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed)
+
+	// Spot-check the first 5 rows by content (rows produced by namePrefix-N).
+	// value is row*0.5, so ordering by value yields rows 0..4 in order.
+	rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`)
+	require.NoError(t, err)
+	defer rows.Close()
+	var (
+		sampleNames       []string
+		sampleValues      []float64
+		sampleActive      []bool
+		samplePayloadNull []bool
+	)
+	for rows.Next() {
+		var n string
+		var v float64
+		var a, pn bool
+		require.NoError(t, rows.Scan(&n, &v, &a, &pn))
+		sampleNames = append(sampleNames, n)
+		sampleValues = append(sampleValues, v)
+		sampleActive = append(sampleActive, a)
+		samplePayloadNull = append(samplePayloadNull, pn)
+	}
+	require.NoError(t, rows.Err())
+	assert.Equal(t, []string{namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", namePrefix + "-000003", namePrefix + "-000004"}, sampleNames)
+	assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues)
+	assert.Equal(t, []bool{true, false, true, false, true}, sampleActive)
+	assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull)
+
+	// Active-row count guards against bit-packing artefacts across batches.
+	var activeCount int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount))
+	assert.Equal(t, totalRows/2, activeCount)
+	assertArrowIPCFileGeometry(t, env, namePrefix, totalRows)
+
+	t.Logf("arrow ipc stream file ingest: %d rows from %d-batch file in %s (%.0f rows/s)",
+		totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds())
+}
+
+// TestIngest_Postgres_ArrowIPCFile_FileFormat builds a 50×1000-row Arrow IPC
+// *file* format file (ARROW1 magic + random-access footer) on disk and
+// ingests it via IngestArrowIPCFile. The client should detect the magic,
+// open the file with ipc.FileReader, and re-emit as a stream to the server.
+func TestIngest_Postgres_ArrowIPCFile_FileFormat(t *testing.T) {
+	env := setupEnv(t)
+
+	const (
+		numBatches   = 50
+		rowsPerBatch = 1000
+		totalRows    = numBatches * rowsPerBatch
+		namePrefix   = "file"
+	)
+
+	path := filepath.Join(t.TempDir(), "events_file.arrow")
+	writeEventsArrowFile(t, path, namePrefix, arrowFileFmt, numBatches, rowsPerBatch)
+
+	// Sanity-check that we actually wrote the file format (ARROW1 prefix).
+	head, err := os.ReadFile(path)
+	require.NoError(t, err)
+	require.GreaterOrEqual(t, len(head), 6)
+	assert.Equal(t, "ARROW1", string(head[:6]), "test setup must produce file format with ARROW1 magic")
+
+	// elapsed covers only the ingest call, so the rows/s figure logged at the
+	// end excludes file generation and verification.
+	start := time.Now()
+	res, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events", path)
+	elapsed := time.Since(start)
+	require.NoError(t, err)
+	require.NotNil(t, res)
+	assert.Equal(t, int64(totalRows), res.Inserted)
+
+	// Synchronicity check.
+	// NOTE(review): the unfiltered COUNT(*) assumes the events table is empty
+	// when the test starts — presumably setupEnv guarantees that; confirm.
+	countStart := time.Now()
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count))
+	countElapsed := time.Since(countStart)
+	assert.Equal(t, totalRows, count, "all rows must be visible immediately")
+	t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed)
+
+	// Spot-check the five rows with the smallest value. writeEventsArrowFile
+	// sets value to row*0.5, so these are rows 0..4 in order.
+	rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`)
+	require.NoError(t, err)
+	defer rows.Close()
+	var (
+		sampleNames       []string
+		sampleValues      []float64
+		sampleActive      []bool
+		samplePayloadNull []bool
+	)
+	for rows.Next() {
+		var n string
+		var v float64
+		var a, pn bool
+		require.NoError(t, rows.Scan(&n, &v, &a, &pn))
+		sampleNames = append(sampleNames, n)
+		sampleValues = append(sampleValues, v)
+		sampleActive = append(sampleActive, a)
+		samplePayloadNull = append(samplePayloadNull, pn)
+	}
+	require.NoError(t, rows.Err())
+	assert.Equal(t, []string{namePrefix + "-000000", namePrefix + "-000001", namePrefix + "-000002", namePrefix + "-000003", namePrefix + "-000004"}, sampleNames)
+	assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues)
+	assert.Equal(t, []bool{true, false, true, false, true}, sampleActive)
+	// Only row 0 of the first five satisfies row%5 == 0, i.e. has a NULL payload.
+	assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull)
+
+	// is_active is true for even rows, so exactly half of all rows are active.
+	var activeCount int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount))
+	assert.Equal(t, totalRows/2, activeCount)
+	assertArrowIPCFileGeometry(t, env, namePrefix, totalRows)
+
+	t.Logf("arrow ipc file-format ingest: %d rows from %d-batch file in %s (%.0f rows/s)",
+		totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds())
+}
+
+// assertArrowIPCFileGeometry verifies the geometry columns of the last row
+// written by writeEventsArrowFile (index totalRows-1). The row is looked up by
+// name in Postgres and its values and SRIDs are compared with the expected
+// ones; the same row is then read back through hugr with a name filter.
+func assertArrowIPCFileGeometry(t *testing.T, env *ingestEnv, namePrefix string, totalRows int) {
+	t.Helper()
+	lastName, lastPoint := geometryBatchRow(namePrefix, totalRows-1)
+	values, srids := scanGeometryValuesWithSRID(t, env.pgConn.QueryRow(fmt.Sprintf(`
+		SELECT %s
+		FROM events
+		WHERE name = $1
+	`, geometrySelectList(true)), lastName))
+	assert.Equal(t, geometryExpected(pointWKT(lastPoint), coord(lastPoint.x), coord(lastPoint.y)), values)
+	assert.Equal(t, geometrySRIDExpected(), srids)
+	// lastName is interpolated into the filter text unescaped; that is fine for
+	// the generated test names but would not be for arbitrary input.
+	assertGeometryReadThroughHugr(t, env.service, env.dsName, fmt.Sprintf(`filter: { name: { eq: "%s" } }`, lastName), []map[string]any{
+		geometryReadExpected(lastName, lastPoint, lastPoint.x, lastPoint.y),
+	})
+}
+
+// TestIngest_Postgres_ArrowIPCFile_NotFound checks that a missing file
+// surfaces a clean error without touching the server.
+//
+// NOTE(review): the test only asserts that some error is returned; neither the
+// error's kind nor the "without touching the server" part is checked here.
+func TestIngest_Postgres_ArrowIPCFile_NotFound(t *testing.T) {
+	env := setupEnv(t)
+	_, err := env.client.IngestArrowIPCFile(context.Background(), "pg_ingest.events",
+		filepath.Join(t.TempDir(), "does-not-exist.arrows"))
+	require.Error(t, err)
+}
+
+// TestIngest_Postgres_LazyReader exercises NewLazyReader at scale: 50×1000
+// rows generated on demand by a closure, no boilerplate RecordReader
+// implementation. Mirrors TestIngest_Postgres_Bulk to prove the helper is
+// equivalent.
+func TestIngest_Postgres_LazyReader(t *testing.T) {
+	env := setupEnv(t)
+
+	const (
+		numBatches   = 50
+		rowsPerBatch = 1000
+		totalRows    = numBatches * rowsPerBatch
+	)
+
+	pool := memory.NewGoAllocator()
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+		{Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false},
+		{Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+		{Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true},
+		{Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true},
+	}, nil)
+	base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC)
+
+	// The generator builds one batch per call and signals end-of-stream with
+	// (nil, nil) once numBatches batches have been produced.
+	batchIdx := 0
+	reader := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) {
+		if batchIdx >= numBatches {
+			return nil, nil
+		}
+		rb := array.NewRecordBuilder(pool, schema)
+		defer rb.Release()
+		fields := eventsRecordBuildersFor(rb)
+		for i := 0; i < rowsPerBatch; i++ {
+			row := batchIdx*rowsPerBatch + i
+			fields.names.Append(fmt.Sprintf("lz-%06d", row))
+			fields.values.Append(float64(row) * 0.5)
+			fields.active.Append(row%2 == 0)
+			if row%5 == 0 {
+				fields.payloads.AppendNull()
+			} else {
+				fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row))
+			}
+			fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro()))
+		}
+		rec := rb.NewRecordBatch()
+		batchIdx++
+		return rec, nil
+	})
+	defer reader.Release()
+
+	start := time.Now()
+	res, err := env.client.Ingest(context.Background(), "pg_ingest.events", reader)
+	elapsed := time.Since(start)
+	require.NoError(t, err)
+	// Guard the dereference below, as the ArrowIPCFile tests do.
+	require.NotNil(t, res)
+	assert.Equal(t, int64(totalRows), res.Inserted)
+
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count))
+	assert.Equal(t, totalRows, count)
+
+	t.Logf("lazy-reader bulk ingest: %d rows in %d batches in %s (%.0f rows/s)",
+		totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds())
+}
+
+// TestIngest_LazyReader_Termination is a unit-style test for NewLazyReader's
+// termination semantics (no server / postgres needed): (nil, nil) ends the
+// stream; (_, err) surfaces via Err().
+func TestIngest_LazyReader_Termination(t *testing.T) {
+	pool := memory.NewGoAllocator()
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false},
+	}, nil)
+	// mk builds a single-row batch holding v.
+	mk := func(v int32) arrow.RecordBatch {
+		b := array.NewRecordBuilder(pool, schema)
+		defer b.Release()
+		recordFieldBuilder(t, b, "x").(*array.Int32Builder).Append(v)
+		return b.NewRecordBatch()
+	}
+
+	// Case 1: gen returns batches then nil — clean end-of-stream.
+	{
+		i := 0
+		r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) {
+			if i >= 3 {
+				return nil, nil
+			}
+			i++
+			return mk(int32(i)), nil
+		})
+		// Deferred to function exit, not to the end of this bare block.
+		defer r.Release()
+		seen := 0
+		for r.Next() {
+			seen++
+		}
+		require.NoError(t, r.Err())
+		assert.Equal(t, 3, seen)
+		assert.False(t, r.Next(), "Next after end-of-stream stays false")
+	}
+
+	// Case 2: gen returns an error — surfaces via Err, terminates stream.
+	{
+		errBoom := errors.New("boom")
+		i := 0
+		r := hugrclient.NewLazyReader(schema, func() (arrow.RecordBatch, error) {
+			if i == 2 {
+				return nil, errBoom
+			}
+			i++
+			return mk(int32(i)), nil
+		})
+		defer r.Release()
+		seen := 0
+		for r.Next() {
+			seen++
+		}
+		assert.Equal(t, 2, seen, "should yield batches before the failing call")
+		require.Error(t, r.Err())
+		assert.ErrorIs(t, r.Err(), errBoom)
+	}
+}
+
+// TestIngest_Postgres_UnknownColumn sends a record whose schema contains a
+// column the events table does not have. The ingest must fail with an error
+// that names the column, and no rows may be inserted.
+func TestIngest_Postgres_UnknownColumn(t *testing.T) {
+	env := setupEnv(t)
+
+	pool := memory.NewGoAllocator()
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+		{Name: "not_a_column", Type: arrow.PrimitiveTypes.Int32, Nullable: false},
+	}, nil)
+	b := array.NewRecordBuilder(pool, schema)
+	defer b.Release()
+	recordFieldBuilder(t, b, "name").(*array.StringBuilder).AppendValues([]string{"x"}, nil)
+	recordFieldBuilder(t, b, "not_a_column").(*array.Int32Builder).AppendValues([]int32{1}, nil)
+	rec := b.NewRecordBatch()
+	defer rec.Release()
+
+	_, err := env.client.IngestRecord(context.Background(), "pg_ingest.events", rec)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "not_a_column")
+
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count))
+	assert.Equal(t, 0, count, "no rows should have been inserted on validation failure")
+}
+
+// TestIngest_Postgres_UnknownDataObject checks that ingesting into a data
+// object name that does not exist returns an error.
+func TestIngest_Postgres_UnknownDataObject(t *testing.T) {
+	env := setupEnv(t)
+
+	pool := memory.NewGoAllocator()
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "x", Type: arrow.PrimitiveTypes.Int32, Nullable: false},
+	}, nil)
+	b := array.NewRecordBuilder(pool, schema)
+	defer b.Release()
+	recordFieldBuilder(t, b, "x").(*array.Int32Builder).AppendValues([]int32{1}, nil)
+	rec := b.NewRecordBatch()
+	defer rec.Release()
+
+	_, err := env.client.IngestRecord(context.Background(), "pg_ingest.does_not_exist", rec)
+	require.Error(t, err)
+}
+
+// TestIngest_HTTP_Direct exercises low-level HTTP behaviour that the typed
+// client smooths over: bad Content-Type, missing data_object, wrong method.
+// It writes a small Arrow stream to validate request parsing of /ipc/ingest,
+// then streams a 50×1000-row payload through an io.Pipe in a single POST.
+func TestIngest_HTTP_Direct(t *testing.T) {
+	env := setupEnv(t)
+
+	// Missing data_object.
+	resp, err := http.Post(env.server.URL+"/ipc/ingest", "application/vnd.apache.arrow.stream", bytes.NewReader(nil))
+	require.NoError(t, err)
+	b, _ := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b))
+
+	// Wrong method.
+	req, err := http.NewRequest(http.MethodGet, env.server.URL+"/ipc/ingest?data_object=pg_ingest.events", nil)
+	require.NoError(t, err)
+	resp, err = http.DefaultClient.Do(req)
+	require.NoError(t, err)
+	resp.Body.Close()
+	assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode)
+
+	// Wrong content type.
+	resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"text/plain", bytes.NewReader([]byte("hello")))
+	require.NoError(t, err)
+	resp.Body.Close()
+	assert.Equal(t, http.StatusUnsupportedMediaType, resp.StatusCode)
+
+	// Body is not a valid Arrow stream.
+	resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", bytes.NewReader([]byte("not arrow")))
+	require.NoError(t, err)
+	b, _ = io.ReadAll(resp.Body)
+	resp.Body.Close()
+	assert.Equal(t, http.StatusBadRequest, resp.StatusCode, "body=%s", string(b))
+
+	// Happy-path direct POST returning JSON.
+	pool := memory.NewGoAllocator()
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+		{Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false},
+		{Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+	}, nil)
+	bld := array.NewRecordBuilder(pool, schema)
+	recordFieldBuilder(t, bld, "name").(*array.StringBuilder).AppendValues([]string{"direct"}, nil)
+	recordFieldBuilder(t, bld, "value").(*array.Float64Builder).AppendValues([]float64{42}, nil)
+	recordFieldBuilder(t, bld, "is_active").(*array.BooleanBuilder).AppendValues([]bool{true}, nil)
+	rec := bld.NewRecordBatch()
+	bld.Release()
+	defer rec.Release()
+
+	var buf bytes.Buffer
+	w := ipc.NewWriter(&buf, ipc.WithSchema(schema))
+	require.NoError(t, w.Write(rec))
+	require.NoError(t, w.Close())
+
+	resp, err = http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", &buf)
+	require.NoError(t, err)
+	// Read and close the body before asserting so a failed status check does
+	// not leak the body and reports what the server said.
+	b, _ = io.ReadAll(resp.Body)
+	resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(b))
+	var out hugrclient.IngestResult
+	require.NoError(t, json.Unmarshal(b, &out))
+	assert.Equal(t, int64(1), out.Inserted)
+
+	// --- Real-world bulk path -------------------------------------------------
+	// A producer (ETL/CDC/telemetry) streams many RecordBatches in a single
+	// Arrow IPC stream over one HTTP POST. The whole payload is never
+	// materialised in memory client-side — we pipe the writer goroutine
+	// straight into the request body. This is where /ipc/ingest pays off vs.
+	// GraphQL `insert_events(data: ...)` mutations.
+	_, err = env.pgConn.ExecContext(context.Background(), "TRUNCATE TABLE events RESTART IDENTITY")
+	require.NoError(t, err)
+
+	const (
+		numBatches   = 50
+		rowsPerBatch = 1000
+		totalRows    = numBatches * rowsPerBatch
+	)
+
+	bulkSchema := arrow.NewSchema([]arrow.Field{
+		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+		{Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false},
+		{Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+		{Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true},
+		{Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true},
+	}, nil)
+
+	pr, pw := io.Pipe()
+	// Buffered so the writer goroutine can always deliver its result and exit.
+	writeErr := make(chan error, 1)
+	go func() {
+		defer close(writeErr)
+		w := ipc.NewWriter(pw, ipc.WithSchema(bulkSchema))
+		base := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC)
+		var streamErr error
+		for batchIdx := 0; batchIdx < numBatches; batchIdx++ {
+			rb := array.NewRecordBuilder(pool, bulkSchema)
+			fields := eventsRecordBuildersFor(rb)
+			for i := 0; i < rowsPerBatch; i++ {
+				row := batchIdx*rowsPerBatch + i
+				fields.names.Append(fmt.Sprintf("evt-%06d", row))
+				fields.values.Append(float64(row) * 0.5)
+				fields.active.Append(row%2 == 0)
+				if row%5 == 0 {
+					fields.payloads.AppendNull()
+				} else {
+					fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row))
+				}
+				fields.createdAt.Append(arrow.Timestamp(base.Add(time.Duration(row) * time.Millisecond).UnixMicro()))
+			}
+			batchRec := rb.NewRecordBatch()
+			rb.Release()
+			if werr := w.Write(batchRec); werr != nil {
+				streamErr = fmt.Errorf("write batch %d: %w", batchIdx, werr)
+				batchRec.Release()
+				break
+			}
+			batchRec.Release()
+		}
+		if cerr := w.Close(); cerr != nil && streamErr == nil {
+			streamErr = fmt.Errorf("close arrow writer: %w", cerr)
+		}
+		// Closing the pipe with the error (nil means plain EOF) is what ends
+		// the request body on the reader side.
+		_ = pw.CloseWithError(streamErr)
+		writeErr <- streamErr
+	}()
+
+	start := time.Now()
+	bulkResp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", pr)
+	// Collect the writer's result first so the goroutine is known to have
+	// finished, but assert on the HTTP outcome before it: when the server
+	// rejects the stream early the writer usually fails with a closed-pipe
+	// error, which would otherwise hide the server's status and message.
+	werr := <-writeErr
+	require.NoError(t, postErr)
+	bulkBody, readErr := io.ReadAll(bulkResp.Body)
+	bulkResp.Body.Close()
+	elapsed := time.Since(start)
+	require.NoError(t, readErr)
+	require.Equal(t, http.StatusOK, bulkResp.StatusCode, "body=%s", string(bulkBody))
+	require.NoError(t, werr, "writer goroutine failed")
+	var bulkResult hugrclient.IngestResult
+	require.NoError(t, json.Unmarshal(bulkBody, &bulkResult))
+	assert.Equal(t, int64(totalRows), bulkResult.Inserted)
+	assert.ElementsMatch(t, []string{"name", "value", "is_active", "payload", "created_at"}, bulkResult.Columns)
+
+	// Time the COUNT(*) right after the POST returns to prove the writes are
+	// synchronous: if the server reported "inserted" before the data was
+	// actually committed to Postgres, COUNT(*) would either lag or be partial.
+	countStart := time.Now()
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events").Scan(&count))
+	countElapsed := time.Since(countStart)
+	assert.Equal(t, totalRows, count, "all rows must be visible the moment POST returns")
+	t.Logf("post-POST COUNT(*) visibility: %d rows in %s — no async lag", count, countElapsed)
+
+	// Spot-check a sample to confirm per-row fidelity end-to-end.
+	rows, err := env.pgConn.Query(`SELECT name, value, is_active, payload IS NULL FROM events ORDER BY value LIMIT 5`)
+	require.NoError(t, err)
+	defer rows.Close()
+	var (
+		sampleNames       []string
+		sampleValues      []float64
+		sampleActive      []bool
+		samplePayloadNull []bool
+	)
+	for rows.Next() {
+		var n string
+		var v float64
+		var a, pn bool
+		require.NoError(t, rows.Scan(&n, &v, &a, &pn))
+		sampleNames = append(sampleNames, n)
+		sampleValues = append(sampleValues, v)
+		sampleActive = append(sampleActive, a)
+		samplePayloadNull = append(samplePayloadNull, pn)
+	}
+	require.NoError(t, rows.Err())
+	assert.Equal(t, []string{"evt-000000", "evt-000001", "evt-000002", "evt-000003", "evt-000004"}, sampleNames)
+	assert.Equal(t, []float64{0, 0.5, 1.0, 1.5, 2.0}, sampleValues)
+	assert.Equal(t, []bool{true, false, true, false, true}, sampleActive)
+	// row%5 == 0 ⇒ payload IS NULL; in the first five rows that's just row 0.
+	assert.Equal(t, []bool{true, false, false, false, false}, samplePayloadNull)
+
+	// Cross-check the active-row count to ensure the boolean column survived
+	// without bit-packing artefacts across batch boundaries.
+	var activeCount int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE is_active").Scan(&activeCount))
+	assert.Equal(t, totalRows/2, activeCount)
+
+	t.Logf("bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)",
+		totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds())
+}
+
+// TestIngest_HTTP_GeometryTypes posts two rows carrying every geometry column
+// variant straight to /ipc/ingest, then checks the stored values and SRIDs in
+// Postgres for both rows.
+func TestIngest_HTTP_GeometryTypes(t *testing.T) {
+	env := setupEnv(t)
+
+	rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{
+		{name: "geo-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}},
+		{name: "geo-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}},
+	})
+	defer rec.Release()
+
+	var buf bytes.Buffer
+	w := ipc.NewWriter(&buf, ipc.WithSchema(schema))
+	require.NoError(t, w.Write(rec))
+	require.NoError(t, w.Close())
+
+	resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", &buf)
+	require.NoError(t, err)
+	body, _ := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body))
+
+	var out hugrclient.IngestResult
+	require.NoError(t, json.Unmarshal(body, &out))
+	assert.Equal(t, int64(2), out.Inserted)
+	assert.ElementsMatch(t, geometryTypesColumns(), out.Columns)
+
+	// %% is the Sprintf escape for a literal % in the LIKE pattern.
+	rows, err := env.pgConn.Query(fmt.Sprintf(`
+		SELECT name,
+		       %s
+		FROM events
+		WHERE name LIKE 'geo-%%'
+		ORDER BY name
+	`, geometrySelectList(true)))
+	require.NoError(t, err)
+	defer rows.Close()
+
+	got := map[string][]string{}
+	gotSRID := map[string][]int{}
+	for rows.Next() {
+		name, values, srids := scanNamedGeometryValuesWithSRID(t, rows)
+		got[name] = values
+		gotSRID[name] = srids
+	}
+	require.NoError(t, rows.Err())
+	assert.Equal(t, map[string][]string{
+		"geo-a": geometryExpected("POINT(30.5 50.25)", "0", "0"),
+		"geo-b": geometryExpected("POINT(-73.935242 40.73061)", "1", "1"),
+	}, got)
+	assert.Equal(t, map[string][]int{
+		"geo-a": geometrySRIDExpected(),
+		"geo-b": geometrySRIDExpected(),
+	}, gotSRID)
+}
+
+// TestIngest_HTTP_GeometryTypes_ReadThroughHugr ingests two geometry rows over
+// HTTP and verifies them by reading back through hugr rather than by querying
+// Postgres directly.
+func TestIngest_HTTP_GeometryTypes_ReadThroughHugr(t *testing.T) {
+	env := setupEnv(t)
+
+	rec, schema := makeGeometryTypesRecord(t, []geometryTypesRow{
+		{name: "geo-read-a", value: 1, active: true, point: xyPoint{x: 30.5, y: 50.25}, shapeOrigin: xyPoint{x: 0, y: 0}},
+		{name: "geo-read-b", value: 2, active: true, point: xyPoint{x: -73.935242, y: 40.730610}, shapeOrigin: xyPoint{x: 1, y: 1}},
+	})
+	defer rec.Release()
+
+	var buf bytes.Buffer
+	w := ipc.NewWriter(&buf, ipc.WithSchema(schema))
+	require.NoError(t, w.Write(rec))
+	require.NoError(t, w.Close())
+
+	resp, err := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", &buf)
+	require.NoError(t, err)
+	body, _ := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body))
+
+	assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { like: "geo-read-%" } }`, []map[string]any{
+		geometryReadExpected("geo-read-a", xyPoint{x: 30.5, y: 50.25}, 0, 0),
+		geometryReadExpected("geo-read-b", xyPoint{x: -73.935242, y: 40.730610}, 1, 1),
+	})
+}
+
+// TestIngest_HTTP_GeometryTypes_Bulk50k streams 50×1000 geometry rows through
+// an io.Pipe in one POST, then checks the row count and the last row's
+// geometry both in Postgres and through hugr. The logged throughput covers the
+// POST only, not the verification queries.
+func TestIngest_HTTP_GeometryTypes_Bulk50k(t *testing.T) {
+	env := setupEnv(t)
+
+	const (
+		numBatches   = 50
+		rowsPerBatch = 1000
+		totalRows    = numBatches * rowsPerBatch
+		namePrefix   = "pg-geo-bulk"
+	)
+	schema := geometryTypesSchema()
+	pool := memory.NewGoAllocator()
+
+	pr, pw := io.Pipe()
+	// Buffered so the writer goroutine can always deliver its result and exit.
+	writeErr := make(chan error, 1)
+	go func() {
+		defer close(writeErr)
+		w := ipc.NewWriter(pw, ipc.WithSchema(schema))
+		var streamErr error
+		for batchIdx := 0; batchIdx < numBatches; batchIdx++ {
+			// NOTE(review): t is handed to a helper on a non-test goroutine.
+			// If buildGeometryTypesBatch can call t.FailNow (e.g. through
+			// require), that is not permitted off the test goroutine — confirm.
+			rec := buildGeometryTypesBatch(t, pool, schema, batchIdx, rowsPerBatch, namePrefix)
+			if err := w.Write(rec); err != nil {
+				streamErr = fmt.Errorf("write geometry batch %d: %w", batchIdx, err)
+				rec.Release()
+				break
+			}
+			rec.Release()
+		}
+		if err := w.Close(); err != nil && streamErr == nil {
+			streamErr = fmt.Errorf("close arrow writer: %w", err)
+		}
+		_ = pw.CloseWithError(streamErr)
+		writeErr <- streamErr
+	}()
+
+	start := time.Now()
+	resp, postErr := http.Post(env.server.URL+"/ipc/ingest?data_object=pg_ingest.events",
+		"application/vnd.apache.arrow.stream", pr)
+	// Collect the writer's result first, but assert on the HTTP outcome before
+	// it so an early server rejection is reported with its status and body
+	// rather than as a closed-pipe writer error.
+	werr := <-writeErr
+	require.NoError(t, postErr)
+	body, _ := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	// Stop the clock here: everything below is verification, not ingest.
+	elapsed := time.Since(start)
+	require.Equal(t, http.StatusOK, resp.StatusCode, "body=%s", string(body))
+	require.NoError(t, werr, "writer goroutine failed")
+
+	var out hugrclient.IngestResult
+	require.NoError(t, json.Unmarshal(body, &out))
+	assert.Equal(t, int64(totalRows), out.Inserted)
+
+	var count int
+	require.NoError(t, env.pgConn.QueryRow("SELECT COUNT(*) FROM events WHERE name LIKE 'pg-geo-bulk-%'").Scan(&count))
+	assert.Equal(t, totalRows, count)
+
+	values, srids := scanGeometryValuesWithSRID(t, env.pgConn.QueryRow(fmt.Sprintf(`
+		SELECT %s
+		FROM events
+		WHERE name = 'pg-geo-bulk-049999'
+	`, geometrySelectList(true))))
+	assert.Equal(t, geometryExpected("POINT(99 49)", "99", "49"), values)
+	assert.Equal(t, geometrySRIDExpected(), srids)
+	assertGeometryReadThroughHugr(t, env.service, env.dsName, `filter: { name: { eq: "pg-geo-bulk-049999" } }`, []map[string]any{
+		geometryReadExpected("pg-geo-bulk-049999", xyPoint{x: 99, y: 49}, 99, 49),
+	})
+
+	t.Logf("geometry bulk ingest: %d rows in %d batches via one /ipc/ingest POST in %s (%.0f rows/s)",
+		totalRows, numBatches, elapsed, float64(totalRows)/elapsed.Seconds())
+}
+
+// lazyEventsReader is an array.RecordReader that generates events-table
+// RecordBatches on demand. This is the shape of a real-world Arrow producer
+// (parquet scanner, CDC tap, kafka batcher) — the whole stream is never
+// materialised in memory beyond the batch currently being consumed.
+type lazyEventsReader struct {
+	// Fixed configuration, set once by newLazyEventsReader.
+	pool         memory.Allocator
+	schema       *arrow.Schema
+	numBatches   int
+	rowsPerBatch int
+	base         time.Time
+
+	// Iteration state.
+	batchIdx int               // index of the next batch to generate
+	current  arrow.RecordBatch // batch returned by the last successful Next, or nil
+	err      error             // never assigned in this file; Err always returns nil
+	refCount atomic.Int64
+}
+
+// newLazyEventsReader returns a reader that will yield numBatches batches of
+// rowsPerBatch rows each, allocated from pool, with created_at offsets
+// computed from base. The reader starts with a reference count of 1, so the
+// caller owns one Release.
+func newLazyEventsReader(pool memory.Allocator, numBatches, rowsPerBatch int, base time.Time) *lazyEventsReader {
+	schema := arrow.NewSchema([]arrow.Field{
+		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+		{Name: "value", Type: arrow.PrimitiveTypes.Float64, Nullable: false},
+		{Name: "is_active", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+		{Name: "payload", Type: arrow.BinaryTypes.String, Nullable: true},
+		{Name: "created_at", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true},
+	}, nil)
+	r := &lazyEventsReader{
+		pool:         pool,
+		schema:       schema,
+		numBatches:   numBatches,
+		rowsPerBatch: rowsPerBatch,
+		base:         base,
+	}
+	r.refCount.Add(1)
+	return r
+}
+
+// Schema returns the fixed five-column events schema.
+func (r *lazyEventsReader) Schema() *arrow.Schema { return r.schema }
+
+// Err returns the stored error, which nothing in this type ever sets.
+func (r *lazyEventsReader) Err() error { return r.err }
+
+// Next releases the previously returned batch, then builds the next one. It
+// returns false once numBatches batches have been produced. Row values follow
+// the same pattern as the other bulk tests: value is row*0.5, is_active is
+// true for even rows, payload is NULL for every fifth row.
+func (r *lazyEventsReader) Next() bool {
+	if r.current != nil {
+		r.current.Release()
+		r.current = nil
+	}
+	if r.batchIdx >= r.numBatches {
+		return false
+	}
+	rb := array.NewRecordBuilder(r.pool, r.schema)
+	defer rb.Release()
+	fields := eventsRecordBuildersFor(rb)
+	for i := 0; i < r.rowsPerBatch; i++ {
+		row := r.batchIdx*r.rowsPerBatch + i
+		fields.names.Append(fmt.Sprintf("evt-%06d", row))
+		fields.values.Append(float64(row) * 0.5)
+		fields.active.Append(row%2 == 0)
+		if row%5 == 0 {
+			fields.payloads.AppendNull()
+		} else {
+			fields.payloads.Append(fmt.Sprintf(`{"row":%d}`, row))
+		}
+		fields.createdAt.Append(arrow.Timestamp(r.base.Add(time.Duration(row) * time.Millisecond).UnixMicro()))
+	}
+	r.current = rb.NewRecordBatch()
+	r.batchIdx++
+	return true
+}
+
+// RecordBatch and Record both return the batch produced by the last
+// successful Next. It stays valid only until the following Next or the final
+// Release, both of which release it.
+func (r *lazyEventsReader) RecordBatch() arrow.RecordBatch { return r.current }
+func (r *lazyEventsReader) Record() arrow.RecordBatch      { return r.current }
+
+// Retain adds a reference to the reader.
+func (r *lazyEventsReader) Retain() { r.refCount.Add(1) }
+
+// Release drops a reference. When the count reaches zero the batch currently
+// held, if any, is released.
+func (r *lazyEventsReader) Release() {
+	if r.refCount.Add(-1) == 0 {
+		if r.current != nil {
+			r.current.Release()
+			r.current = nil
+		}
+	}
+}
diff --git a/integration-test/ingest-postgres/run.sh b/integration-test/ingest-postgres/run.sh
new file mode 100755
index 00000000..a3c4a241
--- /dev/null
+++ b/integration-test/ingest-postgres/run.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Run the /ipc/ingest integration tests against a fresh Postgres container.
+#
+# Usage:
+#   ./run.sh          # bring up postgres, run tests, tear down
+#   ./run.sh keep     # leave the container running after tests (for re-runs)
+#
+# The tests pick up the postgres DSN from INGEST_POSTGRES_DSN; if unset, this
+# script populates it with the dockerized instance.
+
+set -euo pipefail
+
+HERE="$(cd "$(dirname "$0")" && pwd)"
+COMPOSE_FILE="$HERE/docker-compose.yml"
+
+cleanup() {
+  if [[ "${1:-}" != "keep" ]]; then
+    docker compose -f "$COMPOSE_FILE" down -v
+  fi
+}
+trap 'cleanup "${1:-}"' EXIT
+
+docker compose -f "$COMPOSE_FILE" up -d --wait
+
+export INGEST_POSTGRES_DSN="${INGEST_POSTGRES_DSN:-postgres://test:test@127.0.0.1:5437/ingestdb}"
+export HUGR_INGEST_SCHEMAS_PATH="$HERE/testdata/schemas"
+
+cd "$HERE/../.."
+go test -tags=duckdb_arrow -count=1 -v ./integration-test/ingest-postgres/...
diff --git a/integration-test/ingest-postgres/testdata/init.sql b/integration-test/ingest-postgres/testdata/init.sql
new file mode 100644
index 00000000..661f25fc
--- /dev/null
+++ b/integration-test/ingest-postgres/testdata/init.sql
@@ -0,0 +1,68 @@
+-- Schema used by the /ipc/ingest integration tests.
+-- A single events table with a mix of scalar types and an autogen primary key
+-- so the tests can also exercise "default value" behaviour (omitting the PK
+-- from the Arrow stream).
+ +CREATE EXTENSION IF NOT EXISTS postgis; + +CREATE TABLE events ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + value DOUBLE PRECISION NOT NULL, + is_active BOOLEAN NOT NULL DEFAULT true, + owner_id BIGINT, + payload JSONB, + payload_large_string JSONB, + payload_string_view JSONB, + payload_binary JSONB, + payload_large_binary JSONB, + payload_binary_view JSONB, + payload_struct JSONB, + payload_list JSONB, + payload_large_list JSONB, + payload_fixed_size_list JSONB, + payload_list_view JSONB, + payload_large_list_view JSONB, + payload_map JSONB, + payload_scalar JSONB, + payload_arrow_json JSONB, + payload_geo_point JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + geom GEOMETRY(Point, 0), + geom_4326 GEOMETRY(Point, 4326), + geom_wkt GEOMETRY(LineString, 0), + geom_wkt_4326 GEOMETRY(LineString, 4326), + geom_geojson GEOMETRY(Polygon, 0), + geom_hugr_geojson GEOMETRY(Polygon, 0), + geom_plain_geojson GEOMETRY(Polygon, 0), + geom_geojson_struct GEOMETRY(Polygon, 0), + geom_geojson_arrow_json GEOMETRY(Polygon, 0), + geom_wkb GEOMETRY(Point, 0), + geom_hexwkb GEOMETRY(Point, 0), + geom_line GEOMETRY(LineString, 0), + geom_polygon_native GEOMETRY(Polygon, 0), + geom_multipoint GEOMETRY(MultiPoint, 0), + geom_multiline GEOMETRY(MultiLineString, 0), + geom_multipolygon GEOMETRY(MultiPolygon, 0) +); + +-- This table intentionally contains only binary-COPY-compatible PostgreSQL +-- types. The integration suite uses it to verify that duckdb-postgres selects +-- FORMAT BINARY rather than falling back to text because of a JSONB column. +CREATE TABLE binary_events ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + value DOUBLE PRECISION NOT NULL, + geom GEOMETRY(Point, 0) +); + +-- Permissive geometry table for ingest edge-case coverage: NULL, 3D (Z), +-- EMPTY and GEOMETRYCOLLECTION values. 
The column is a bare `geometry` +-- (no type/SRID typmod) so it accepts whatever the native +-- DuckDB GEOMETRY -> PostGIS bridge produces, letting the test assert +-- whether the bridge preserves these non-trivial geometries faithfully. +CREATE TABLE geom_edge ( + id BIGSERIAL PRIMARY KEY, + name VARCHAR NOT NULL, + geom GEOMETRY +); diff --git a/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql new file mode 100644 index 00000000..43096fa6 --- /dev/null +++ b/integration-test/ingest-postgres/testdata/schemas/pg_ingest/schema.graphql @@ -0,0 +1,53 @@ +type events @table(name: "events") { + id: BigInt! @pk @default(sequence: "events_id_seq") + name: String! + value: Float! + is_active: Boolean! @default(value: "true") + owner_id: BigInt + payload: JSON + payload_large_string: JSON + payload_string_view: JSON + payload_binary: JSON + payload_large_binary: JSON + payload_binary_view: JSON + payload_struct: JSON + payload_list: JSON + payload_large_list: JSON + payload_fixed_size_list: JSON + payload_list_view: JSON + payload_large_list_view: JSON + payload_map: JSON + payload_scalar: JSON + payload_arrow_json: JSON + payload_geo_point: JSON + created_at: Timestamp @default(value: "now()") + geom: Geometry @geometry_info(srid: 0, type: POINT) + geom_4326: Geometry @geometry_info(srid: 4326, type: POINT) + geom_wkt: Geometry @geometry_info(srid: 0, type: LINESTRING) + geom_wkt_4326: Geometry @geometry_info(srid: 4326, type: LINESTRING) + geom_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_hugr_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_plain_geojson: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_geojson_struct: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_geojson_arrow_json: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_wkb: Geometry @geometry_info(srid: 0, type: POINT) + geom_hexwkb: Geometry 
@geometry_info(srid: 0, type: POINT) + geom_line: Geometry @geometry_info(srid: 0, type: LINESTRING) + geom_polygon_native: Geometry @geometry_info(srid: 0, type: POLYGON) + geom_multipoint: Geometry @geometry_info(srid: 0, type: MULTIPOINT) + geom_multiline: Geometry @geometry_info(srid: 0, type: MULTILINESTRING) + geom_multipolygon: Geometry @geometry_info(srid: 0, type: MULTIPOLYGON) +} + +type binary_events @table(name: "binary_events") { + id: BigInt! @pk @default(sequence: "binary_events_id_seq") + name: String! + value: Float! + geom: Geometry @geometry_info(srid: 0, type: POINT) +} + +type geom_edge @table(name: "geom_edge") { + id: BigInt! @pk @default(sequence: "geom_edge_id_seq") + name: String! + geom: Geometry @geometry_info(srid: 0) +} diff --git a/ipc-ingest.go b/ipc-ingest.go new file mode 100644 index 00000000..7799d26b --- /dev/null +++ b/ipc-ingest.go @@ -0,0 +1,127 @@ +package hugr + +import ( + "encoding/json" + "errors" + "fmt" + "net/http" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/ipc" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/hugr-lab/query-engine/pkg/auth" + "github.com/hugr-lab/query-engine/pkg/db" + "github.com/hugr-lab/query-engine/pkg/perm" +) + +const ( + ingestContentType = "application/vnd.apache.arrow.stream" + ingestDataObjectArg = "data_object" +) + +// IngestResponse is the success payload returned by /ipc/ingest. +type IngestResponse struct { + DataObject string `json:"data_object"` + Inserted int64 `json:"inserted"` + Columns []string `json:"columns"` +} + +type ingestErrorBody struct { + Error string `json:"error"` +} + +// ipcIngestHandler accepts an Apache Arrow IPC stream in the request body and +// inserts it into a table data object. The planner resolves the target schema, +// validates insert inputs/permissions, casts Arrow values, and builds the +// INSERT FROM SELECT statement over a request-scoped Arrow view. 
+func (s *Service) ipcIngestHandler(w http.ResponseWriter, r *http.Request) {
+	// Every response from this endpoint, success or error, is a JSON document.
+	w.Header().Set("Content-Type", "application/json")
+
+	if r.Method != http.MethodPost {
+		// A 405 response must tell the client which methods are accepted.
+		w.Header().Set("Allow", http.MethodPost)
+		writeIngestError(w, http.StatusMethodNotAllowed, "method not allowed")
+		return
+	}
+
+	dataObject := r.URL.Query().Get(ingestDataObjectArg)
+	if dataObject == "" {
+		writeIngestError(w, http.StatusBadRequest, "missing data_object query parameter")
+		return
+	}
+
+	// An absent Content-Type is tolerated. When present, only the media type
+	// itself is compared (parameters after ';' are ignored) and the comparison
+	// is case-insensitive, so "application/vnd.apache.arrow.streamX" is
+	// rejected while a differently-cased but valid value is accepted.
+	if ct := r.Header.Get("Content-Type"); ct != "" {
+		mediaType, _, _ := strings.Cut(ct, ";")
+		if !strings.EqualFold(strings.TrimSpace(mediaType), ingestContentType) {
+			writeIngestError(w, http.StatusUnsupportedMediaType,
+				fmt.Sprintf("Content-Type must be %s, got %q", ingestContentType, ct))
+			return
+		}
+	}
+
+	// Attach permissions to the context unless a caller already did.
+	ctx := r.Context()
+	if perm.PermissionsFromCtx(ctx) == nil {
+		newCtx, err := s.perm.ContextWithPermissions(ctx)
+		if err != nil {
+			if errors.Is(err, auth.ErrForbidden) {
+				writeIngestError(w, http.StatusForbidden, err.Error())
+				return
+			}
+			writeIngestError(w, http.StatusInternalServerError, err.Error())
+			return
+		}
+		ctx = newCtx
+	}
+
+	// ipc.NewReader fails here when the body does not start with a readable
+	// Arrow IPC schema message; that is a client error.
+	reader, err := ipc.NewReader(r.Body, ipc.WithAllocator(memory.NewGoAllocator()))
+	if err != nil {
+		writeIngestError(w, http.StatusBadRequest, "invalid arrow stream: "+err.Error())
+		return
+	}
+	defer reader.Release()
+	source := db.NewArrowIngestSource(reader)
+
+	plan, err := s.planner.PlanArrowIngest(ctx, s.schema.Provider(), dataObject, source)
+	if err != nil {
+		if errors.Is(err, auth.ErrForbidden) {
+			writeIngestError(w, http.StatusForbidden, err.Error())
+			return
+		}
+		writeIngestError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+	if err := plan.Compile(); err != nil {
+		writeIngestError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
+	// ExecArrowIngest is called with the compiled query text only, so a plan
+	// that carries bind parameters cannot be executed and is treated as an
+	// internal error.
+	if len(plan.Params) != 0 {
+		writeIngestError(w, http.StatusInternalServerError, "arrow ingest plan produced SQL parameters")
+		return
+	}
+	res, err := s.db.ExecArrowIngest(ctx, source, plan.CompiledQuery)
+	if err != nil {
+		writeIngestError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// Best effort: the statement has already executed successfully, so a
+	// failure to read the row count is deliberately not turned into a request
+	// failure; Inserted is reported as 0 in that case.
+	inserted, _ := res.RowsAffected()
+
+	out := IngestResponse{
+		DataObject: dataObject,
+		Inserted:   inserted,
+		Columns:    ingestSchemaColumnNames(reader.Schema()),
+	}
+	// The status line is committed once encoding starts, so an encode/write
+	// error cannot be reported to the client any more.
+	_ = json.NewEncoder(w).Encode(out)
+}
+
+// ingestSchemaColumnNames returns the field names of schema in schema order,
+// or nil when schema is nil.
+func ingestSchemaColumnNames(schema *arrow.Schema) []string {
+	if schema == nil {
+		return nil
+	}
+	out := make([]string, 0, schema.NumFields())
+	for _, f := range schema.Fields() {
+		out = append(out, f.Name)
+	}
+	return out
+}
+
+// writeIngestError writes status followed by a JSON body of the form
+// {"error": msg}. It does not set Content-Type itself; the handler sets it
+// before any response is written.
+func writeIngestError(w http.ResponseWriter, status int, msg string) {
+	w.WriteHeader(status)
+	// Nothing useful can be done if the client connection fails mid-write.
+	_ = json.NewEncoder(w).Encode(ingestErrorBody{Error: msg})
+}
diff --git a/pkg/catalog/compiler/base/options.go b/pkg/catalog/compiler/base/options.go
index 8f007056..de6cb031 100644
--- a/pkg/catalog/compiler/base/options.go
+++ b/pkg/catalog/compiler/base/options.go
@@ -87,6 +87,7 @@ type EngineCapabilities struct {
 	Insert EngineInsertCapabilities
 	Update EngineUpdateCapabilities
 	Delete EngineDeleteCapabilities
+	Ingest EngineIngestCapabilities
 }
 
 type EngineInsertCapabilities struct {
@@ -95,6 +96,24 @@ type EngineInsertCapabilities struct {
 	InsertReferences bool
 }
 
+// EngineIngestCapabilities describes which ingest modes an engine supports.
+type EngineIngestCapabilities struct {
+	// Insert enables append-only INSERT ... SELECT ingest.
+	Insert bool
+	// Merge enables MERGE INTO ingest and requires Insert support.
+	Merge bool
+}
+
+// Available reports whether the engine supports at least one ingest mode.
+func (c EngineIngestCapabilities) Available() bool {
+	return c.Insert || c.Merge
+}
+
+// Valid reports whether the ingest modes form a supported combination.
+// Merge ingest builds on insert semantics and cannot be enabled on its own.
+func (c EngineIngestCapabilities) Valid() bool {
+	return c.Insert || !c.Merge // false only for Merge without Insert
+}
+
 type EngineUpdateCapabilities struct {
 	Update          bool
 	UpdatePKColumns bool
diff --git a/pkg/catalog/compiler/base/options_test.go b/pkg/catalog/compiler/base/options_test.go
new file mode 100644
index 00000000..579bc5e6
--- /dev/null
+++ b/pkg/catalog/compiler/base/options_test.go
@@ -0,0 +1,28 @@
+package base
+
+import "testing"
+
+// TestEngineIngestCapabilities checks Available and Valid for every
+// combination of the Insert and Merge flags.
+func TestEngineIngestCapabilities(t *testing.T) {
+	tests := []struct {
+		name      string
+		caps      EngineIngestCapabilities
+		available bool
+		valid     bool
+	}{
+		{name: "disabled", caps: EngineIngestCapabilities{}, available: false, valid: true},
+		{name: "insert", caps: EngineIngestCapabilities{Insert: true}, available: true, valid: true},
+		{name: "insert and merge", caps: EngineIngestCapabilities{Insert: true, Merge: true}, available: true, valid: true},
+		// Merge alone still counts as "available" but is not a valid combination.
+		{name: "merge without insert", caps: EngineIngestCapabilities{Merge: true}, available: true, valid: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := tt.caps.Available(); got != tt.available {
+				t.Fatalf("Available() = %t, want %t", got, tt.available)
+			}
+			if got := tt.caps.Valid(); got != tt.valid {
+				t.Fatalf("Valid() = %t, want %t", got, tt.valid)
+			}
+		})
+	}
+}
diff --git a/pkg/catalog/compiler/options.go b/pkg/catalog/compiler/options.go
index e5e9f2ca..fd1235b6 100644
--- a/pkg/catalog/compiler/options.go
+++ b/pkg/catalog/compiler/options.go
@@ -11,6 +11,9 @@ type EngineCapabilities = base.EngineCapabilities
 // EngineInsertCapabilities is an alias for base.EngineInsertCapabilities.
 type EngineInsertCapabilities = base.EngineInsertCapabilities
 
+// EngineIngestCapabilities is an alias for base.EngineIngestCapabilities.
+type EngineIngestCapabilities = base.EngineIngestCapabilities
+
 // EngineUpdateCapabilities is an alias for base.EngineUpdateCapabilities.
 type EngineUpdateCapabilities = base.EngineUpdateCapabilities
 
diff --git a/pkg/db/arrow_ingest_source.go b/pkg/db/arrow_ingest_source.go
new file mode 100644
index 00000000..f0cf1473
--- /dev/null
+++ b/pkg/db/arrow_ingest_source.go
@@ -0,0 +1,72 @@
+package db
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/apache/arrow-go/v18/arrow"
+	"github.com/apache/arrow-go/v18/arrow/array"
+	"github.com/google/uuid"
+)
+
+// arrowIngestViewNamePrefix is prepended to every generated ingest view name.
+const arrowIngestViewNamePrefix = "_hugr_arrow_view_"
+
+// ArrowIngestSource binds an Arrow reader to the globally unique DuckDB view
+// name used by both the planner and the ingest executor.
+type ArrowIngestSource struct {
+	Reader   array.RecordReader
+	viewName string
+}
+
+// NewArrowIngestSource wraps reader and assigns it a fresh view name made of
+// arrowIngestViewNamePrefix followed by a random UUID with its hyphens
+// removed. reader is stored as given; a nil reader is not rejected here.
+func NewArrowIngestSource(reader array.RecordReader) ArrowIngestSource {
+	return ArrowIngestSource{
+		Reader:   reader,
+		viewName: arrowIngestViewNamePrefix + strings.ReplaceAll(uuid.NewString(), "-", ""),
+	}
+}
+
+// View returns the view name assigned at construction. It is empty for a
+// zero-value ArrowIngestSource.
+func (s ArrowIngestSource) View() string {
+	return s.viewName
+}
+
+// NeedsSpatial reports whether the Arrow source carries geometry extension
+// metadata that requires DuckDB's spatial extension before registering the view.
+func (s ArrowIngestSource) NeedsSpatial() bool {
+	if s.Reader == nil || s.Reader.Schema() == nil {
+		return false
+	}
+	for _, f := range s.Reader.Schema().Fields() {
+		// The extension name may come from a registered extension type...
+		if extType, ok := f.Type.(arrow.ExtensionType); ok && isGeometryArrowExtension(extType.ExtensionName()) {
+			return true
+		}
+		// ...or from field metadata, under either of two key spellings.
+		if ext, ok := f.Metadata.GetValue("ARROW:extension:name"); ok && isGeometryArrowExtension(ext) {
+			return true
+		}
+		if ext, ok := f.Metadata.GetValue("extension:name"); ok && isGeometryArrowExtension(ext) {
+			return true
+		}
+	}
+	return false
+}
+
+// RegisterView registers the source reader under the source view name.
+func (s ArrowIngestSource) RegisterView(arrowConn interface {
+	RegisterView(reader array.RecordReader, viewName string) (func(), error)
+}) (func(), error) {
+	// arrowConn is an anonymous interface, so any value with a matching
+	// RegisterView method can be passed. The returned func() and error are
+	// whatever arrowConn.RegisterView returns.
+	// NOTE(review): the func() looks like a release callback for the
+	// registered stream — confirm against the connection implementation.
+	if s.Reader == nil {
+		return nil, fmt.Errorf("missing arrow reader")
+	}
+	// A zero-value source (not built by NewArrowIngestSource) has no name.
+	if s.View() == "" {
+		return nil, fmt.Errorf("missing arrow view name")
+	}
+	return arrowConn.RegisterView(s.Reader, s.View())
+}
+
+// isGeometryArrowExtension reports whether ext, compared case-insensitively,
+// is one of the extension names this package treats as geometry: anything
+// starting with "geoarrow.", or the "hugr.geojson", "geojson", "hugr.hexwkb"
+// and "hexwkb" names.
+func isGeometryArrowExtension(ext string) bool {
+	ext = strings.ToLower(ext)
+	return strings.HasPrefix(ext, "geoarrow.") ||
+		ext == "hugr.geojson" ||
+		ext == "geojson" ||
+		ext == "hugr.hexwkb" ||
+		ext == "hexwkb"
+}
diff --git a/pkg/db/arrow_ingest_source_test.go b/pkg/db/arrow_ingest_source_test.go
new file mode 100644
index 00000000..51e2b554
--- /dev/null
+++ b/pkg/db/arrow_ingest_source_test.go
@@ -0,0 +1,20 @@
+package db
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestNewArrowIngestSourceUsesUniqueViewName checks that two sources get
+// different view names and that both names carry the shared prefix.
+func TestNewArrowIngestSourceUsesUniqueViewName(t *testing.T) {
+	// A nil reader is enough: only the generated view name is inspected.
+	first := NewArrowIngestSource(nil)
+	second := NewArrowIngestSource(nil)
+
+	if first.View() == second.View() {
+		t.Fatalf("sources share view name %q", first.View())
+	}
+	for _, name := range []string{first.View(), second.View()} {
+		if !strings.HasPrefix(name, arrowIngestViewNamePrefix) {
+			t.Fatalf("view name %q does not start with %q", name, arrowIngestViewNamePrefix)
+		}
+	}
+}
diff --git a/pkg/db/pool.go b/pkg/db/pool.go
index d41c4523..227337ce 100644
--- a/pkg/db/pool.go
+++ b/pkg/db/pool.go
@@ -4,9 +4,11 @@ import (
 	"context"
 	"database/sql"
 	"database/sql/driver"
+	"errors"
 	"fmt"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/duckdb/duckdb-go/v2"
 )
@@ -221,6 +223,54 @@ func (p *Pool) Arrow(ctx context.Context) (*Arrow, error) {
 	}, nil
 }
 
+// ExecArrowIngest registers source.Reader as a globally named DuckDB view,
+// executes query, then drops the view before releasing the Arrow stream.
+func (p *Pool) ExecArrowIngest(ctx context.Context, source ArrowIngestSource, query string) (result sql.Result, err error) { + if source.Reader == nil { + return nil, fmt.Errorf("missing arrow reader") + } + if source.View() == "" { + return nil, fmt.Errorf("missing arrow view name") + } + ar, err := p.Arrow(ctx) + if err != nil { + return nil, err + } + defer ar.Close() + + execer, ok := ar.drv.(driver.ExecerContext) + if !ok { + return nil, fmt.Errorf("duckdb driver connection does not implement ExecerContext") + } + if source.NeedsSpatial() { + if _, err := execer.ExecContext(ctx, "LOAD spatial", nil); err != nil { + return nil, fmt.Errorf("prepare spatial arrow view: %w", err) + } + } + release, err := source.RegisterView(ar) + if err != nil { + return nil, fmt.Errorf("register arrow view: %w", err) + } + defer func() { + // The view created by duckdb_arrow_scan is global to the DuckDB + // database instance. Cleanup must outlive a canceled request context + // and must happen before the Arrow stream is released. 
+ cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second) + defer cancel() + _, cleanupErr := execer.ExecContext(cleanupCtx, "DROP VIEW IF EXISTS "+quoteIdentifier(source.View()), nil) + release() + if cleanupErr != nil { + err = errors.Join(err, fmt.Errorf("drop arrow ingest view %q: %w", source.View(), cleanupErr)) + } + }() + + return execer.ExecContext(ctx, query, nil) +} + +func quoteIdentifier(name string) string { + return `"` + strings.ReplaceAll(name, `"`, `""`) + `"` +} + func (p *Pool) RegisterScalarFunction(ctx context.Context, function ScalarFunction) error { return RegisterScalarFunction(ctx, p, function) } diff --git a/pkg/db/pool_test.go b/pkg/db/pool_test.go index c095422b..880abb5c 100644 --- a/pkg/db/pool_test.go +++ b/pkg/db/pool_test.go @@ -4,6 +4,10 @@ import ( "context" "sync" "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/memory" ) func TestNewPool(t *testing.T) { @@ -155,6 +159,59 @@ func TestPool_Arrow_Concurrent(t *testing.T) { wg.Wait() } +func TestPool_ExecArrowIngestDropsView(t *testing.T) { + pool, err := NewPool("") + if err != nil { + t.Fatal(err) + } + defer pool.Close() + + ctx := context.Background() + if _, err := pool.Exec(ctx, "CREATE TABLE ingest_target (value INTEGER)"); err != nil { + t.Fatal(err) + } + + schema := arrow.NewSchema([]arrow.Field{{Name: "value", Type: arrow.PrimitiveTypes.Int32}}, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + builder.Field(0).(*array.Int32Builder).Append(42) + record := builder.NewRecordBatch() + defer record.Release() + reader, err := array.NewRecordReader(schema, []arrow.RecordBatch{record}) + if err != nil { + t.Fatal(err) + } + defer reader.Release() + + source := NewArrowIngestSource(reader) + query := "INSERT INTO ingest_target SELECT * FROM " + quoteIdentifier(source.View()) + if _, err := 
pool.ExecArrowIngest(ctx, source, query); err != nil { + t.Fatal(err) + } + + conn, err := pool.Conn(ctx) + if err != nil { + t.Fatal(err) + } + defer conn.Close() + + var value int + if err := conn.QueryRow(ctx, "SELECT value FROM ingest_target").Scan(&value); err != nil { + t.Fatal(err) + } + if value != 42 { + t.Fatalf("inserted value = %d, want 42", value) + } + + var views int + if err := conn.QueryRow(ctx, "SELECT count(*) FROM duckdb_views() WHERE view_name = ?", source.View()).Scan(&views); err != nil { + t.Fatal(err) + } + if views != 0 { + t.Fatalf("Arrow ingest view %q remains in the DuckDB catalog", source.View()) + } +} + func Test_print(t *testing.T) { t.Log("[{\"address\":\"ул. Мира - ул. Мичурина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.530669834,66.644954681]}\",\"id\":\"0381c536-6efc-49e8-b2ec-87303ba0d4f4\",\"isManaged\":false,\"name\":\"C-005\",\"number\":\"C005\"},{\"address\":\"Пермь, улица КИМ, 72\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.021034275,56.293535829]}\",\"id\":\"9cc382ea-5cc5-4b50-bb4f-11ea2de0a393\",\"isManaged\":false,\"name\":\"ДК КДУ-КМД\",\"number\":\"2001\"},{\"address\":\"ул. Мира- Чубынина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.530721108,66.613830328]}\",\"id\":\"2472a103-5e54-49f5-aa78-e690785ea83a\",\"isManaged\":false,\"name\":\"ИДК1-02\",\"number\":\"002\"},{\"address\":\"ул. 
Чубынина - Мира\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531680341,66.612414122]}\",\"id\":\"6aa3eaa4-cfd9-431d-a45f-dc88820604aa\",\"isManaged\":false,\"name\":\"ИДК1-03\",\"number\":\"103\"},{\"address\":\" Арктическая-Губкина-Матросова\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.538271784,66.62610814]}\",\"id\":\"506b8ea2-615c-429d-bb29-bd0dc15e9996\",\"isManaged\":true,\"name\":\"ИДКЗ-01\",\"number\":\"301\"},{\"address\":\" Губкина - Зои Космодемьянской\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.537984428,66.633174419]}\",\"id\":\"841dd2cb-64a1-4d53-b76c-74869a4d01e6\",\"isManaged\":true,\"name\":\"ИДКЗ-02\",\"number\":\"302\"},{\"address\":\"Броднева -Губкина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.537782631,66.639154851]}\",\"id\":\"345ce8d4-65b4-46d3-92e5-b2a48876413e\",\"isManaged\":false,\"name\":\"ИДКЗ-03\",\"number\":\"303\"},{\"address\":\" Матросова - Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.536092423,66.625503302]}\",\"id\":\"9f7591dd-aa75-4d29-8627-e8e9486986c9\",\"isManaged\":false,\"name\":\"ИДКЗ-04\",\"number\":\"304\"},{\"address\":\" Космодемьянской - Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.535844447,66.632648706]}\",\"id\":\"bb0ad7e1-50e3-4982-96c7-70dc2ea54fb8\",\"isManaged\":false,\"name\":\"ИДКЗ-05\",\"number\":\"305\"},{\"address\":\"Броднева-Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.535609528,66.638635397]}\",\"id\":\"eac5e822-ce2a-4425-8a27-47b1f88d9836\",\"isManaged\":false,\"name\":\"ИДКЗ-06\",\"number\":\"306\"},{\"address\":\"Ямальская - Матросова\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534233819,66.625117064]}\",\"id\":\"af482037-3796-4d53-b59c-7e82dfeb454d\",\"isManaged\":false,\"name\":\"ИДКЗ-07\",\"number\":\"307\"},{\"address\":\"Ямальская - 
Космодемьянской\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534003136,66.632198095]}\",\"id\":\"86952cf0-947d-4b9e-84c8-cfa44a3dd5a7\",\"isManaged\":false,\"name\":\"ИДКЗ-08\",\"number\":\"308\"},{\"address\":\"Ямальская- Броднева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.533776997,66.638131142]}\",\"id\":\"498a3164-6e27-4908-b811-5a6b710be17d\",\"isManaged\":false,\"name\":\"ИДКЗ-09\",\"number\":\"309\"},{\"address\":\"улица Чубынина/улица Республики\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.529226826,66.616048263]}\",\"id\":\"d538df50-9193-4a1e-a02c-89f6fd4854cb\",\"isManaged\":false,\"name\":\"Инвиан 1\",\"number\":\"Инв-01\"},{\"address\":\"Пермь\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.019055099,56.290804077]}\",\"id\":\"962efde4-62e2-4128-a438-18f43f8b56e4\",\"isManaged\":true,\"name\":\"Инвиан-02\",\"number\":\"Инв-02\"},{\"address\":\"ул. Богдана Кнунянца\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.549909425,66.579353213]}\",\"id\":\"6d736df2-e46a-4698-9ac2-b02a99eaefd6\",\"isManaged\":false,\"name\":\"С-001\",\"number\":\"С001\"},{\"address\":\"ул. Почтовая - просп. Молодежи\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.538073574,66.596063375]}\",\"id\":\"7db4d259-75a7-4e68-a765-e6c0e91573e4\",\"isManaged\":false,\"name\":\"С-002\",\"number\":\"С002\"},{\"address\":\"ул. Объездная - просп. Молодежи\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.542511344,66.621533632]}\",\"id\":\"bafa44ff-e40d-460c-9f0c-3c3732833245\",\"isManaged\":false,\"name\":\"С-003\",\"number\":\"С003\"},{\"address\":\"ул. Республики - ул. Подшибякина\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.534745795,66.652078629]}\",\"id\":\"3fdb1c7f-d69e-40fd-a29b-316dc95faf65\",\"isManaged\":false,\"name\":\"С-004\",\"number\":\"С004\"},{\"address\":\"ул. 
Броднева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531127024,66.637487411]}\",\"id\":\"c4558a8e-4752-4131-b8a5-1bcf97c925e3\",\"isManaged\":false,\"name\":\"С-006\",\"number\":\"С006\"},{\"address\":\"ул. Артеева\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.531383389,66.631565094]}\",\"id\":\"62a25f70-b9c3-41a3-acc4-46344ab75e08\",\"isManaged\":false,\"name\":\"С-007\",\"number\":\"С007\"},{\"address\":\"ул. Мира\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[66.53162266,66.624526978]}\",\"id\":\"dc0e2a05-1df5-4854-ab12-78623105abae\",\"isManaged\":false,\"name\":\"С-008\",\"number\":\"С008\"},{\"address\":\"Пермь, улица КИМ, 74А\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.019861248,56.291919989]}\",\"id\":\"cea96a45-b726-4e9e-8c86-a507f020257b\",\"isManaged\":true,\"name\":\"Спектр2-01\",\"number\":\"С-01\"},{\"address\":\"Пермь\",\"coordinates\":\"{\\\"type\\\":\\\"Point\\\",\\\"coordinates\\\":[58.020333595,56.292587678]}\",\"id\":\"6cc55c55-87f6-4918-a7ac-46172539d73b\",\"isManaged\":true,\"name\":\"Спектр2-02\",\"number\":\"С-02\"}]") } diff --git a/pkg/engines/airport.go b/pkg/engines/airport.go index bf80f4b3..2f2617b0 100644 --- a/pkg/engines/airport.go +++ b/pkg/engines/airport.go @@ -26,6 +26,7 @@ func (e *AirportEngine) Type() Type { func (e *AirportEngine) Capabilities() *compiler.EngineCapabilities { cap := e.DuckDB.Capabilities() cap.General.SupportDefaultSequences = false + cap.Ingest = compiler.EngineIngestCapabilities{} return cap } diff --git a/pkg/engines/arrow_ingest.go b/pkg/engines/arrow_ingest.go new file mode 100644 index 00000000..083cbc87 --- /dev/null +++ b/pkg/engines/arrow_ingest.go @@ -0,0 +1,397 @@ +package engines + +import ( + "encoding/hex" + "fmt" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + ctypes "github.com/hugr-lab/query-engine/pkg/catalog/types" + 
"github.com/vektah/gqlparser/v2/ast" +) + +// ArrowIngestStagingBuilder owns every SQL expression evaluated by DuckDB while +// an Arrow reader is registered as a view. Target-specific conversion, when a +// target needs one, is applied separately through EngineIngestValueAdapter. +type ArrowIngestStagingBuilder struct { + duckdb DuckDB +} + +func NewArrowIngestStagingBuilder() *ArrowIngestStagingBuilder { + return &ArrowIngestStagingBuilder{} +} + +func (b *ArrowIngestStagingBuilder) SQLValue(v any) (string, error) { + return b.duckdb.SQLValue(v) +} + +func (b *ArrowIngestStagingBuilder) FunctionCall(name string, positional []any, named map[string]any) (string, error) { + return b.duckdb.FunctionCall(name, positional, named) +} + +// SelectExpr converts an Arrow-view column to its canonical DuckDB staging +// representation for the target GraphQL field. +func (b *ArrowIngestStagingBuilder) SelectExpr(field *ast.Field, arrowField arrow.Field, sourceExpr string) (string, error) { + if field == nil || field.Definition == nil { + return sourceExpr, nil + } + switch field.Definition.Type.Name() { + case base.JSONTypeName: + return arrowIngestJSONStagingExpr(arrowField, sourceExpr) + case base.GeometryTypeName: + return arrowIngestGeometryStagingExpr(arrowField, sourceExpr) + default: + return sourceExpr, nil + } +} + +// LiteralExpr converts a non-Arrow value, such as permission data, to a +// canonical DuckDB staging expression. 
+func (b *ArrowIngestStagingBuilder) LiteralExpr(field *ast.Field, value any) (string, error) { + if value == nil { + return "NULL", nil + } + if field != nil && field.Definition != nil && field.Definition.Type.Name() == base.GeometryTypeName { + geom, err := ctypes.ParseGeometryValue(value) + if err != nil { + return "", err + } + if geom == nil { + return "NULL", nil + } + wkbValue, err := ctypes.GeometryToSQLValue(geom) + if err != nil { + return "", err + } + return "ST_GeomFromWKB(from_hex('" + strings.ToUpper(hex.EncodeToString(wkbValue)) + "'))", nil + } + return b.duckdb.SQLValue(value) +} + +const ( + arrowJSONExtension = "arrow.json" + + hugrGeoJSONExtension = "hugr.geojson" + geoArrowGeoJSONExtension = "geoarrow.geojson" + plainGeoJSONExtension = "geojson" + hugrHexWKBExtension = "hugr.hexwkb" + geoArrowHexWKBExtension = "geoarrow.hexwkb" + plainHexWKBExtension = "hexwkb" + + geoArrowWKBExtension = "geoarrow.wkb" + geoArrowWKTExtension = "geoarrow.wkt" + geoArrowPointExtension = "geoarrow.point" + geoArrowLineStringExtension = "geoarrow.linestring" + geoArrowPolygonExtension = "geoarrow.polygon" + geoArrowMultiPointExtension = "geoarrow.multipoint" + geoArrowMultiLineStringExtension = "geoarrow.multilinestring" + geoArrowMultiPolygonExtension = "geoarrow.multipolygon" + geoArrowGeometryExtension = "geoarrow.geometry" + geoArrowGeometryCollectionExtension = "geoarrow.geometrycollection" +) + +func arrowIngestJSONStagingExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + ext := arrowExtensionNameFromTypeOrMetadata(arrowField) + switch { + case ext == "": + return jsonExprFromPlainArrow(arrowField, sourceExpr), nil + case ext == arrowJSONExtension: + return jsonExprFromArrowJSONExtension(arrowField, sourceExpr) + case isGeoJSONExtension(ext): + return jsonExprFromGeoJSONExtension(arrowField, sourceExpr) + case needsGeometryToJSON(ext): + geomExpr, err := geometryExprFromExtension(ext, arrowField, sourceExpr) + if err != nil { + return "", 
err + } + return jsonExprFromGeometryExpr(geomExpr), nil + default: + return "", fmt.Errorf("unsupported Arrow extension %q for JSON ingest", ext) + } +} + +func jsonExprFromPlainArrow(arrowField arrow.Field, sourceExpr string) string { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr + } + return duckDBToJSON(sourceExpr) +} + +func jsonExprFromArrowJSONExtension(arrowField arrow.Field, sourceExpr string) (string, error) { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr, nil + } + return "", storageError(arrowField, arrowJSONExtension) +} + +func jsonExprFromGeoJSONExtension(arrowField arrow.Field, sourceExpr string) (string, error) { + if expr, ok := jsonExprFromSerializedStorage(arrowField, sourceExpr); ok { + return expr, nil + } + if isArrowObjectStorage(arrowStorageTypeID(arrowField.Type)) { + return duckDBToJSON(sourceExpr), nil + } + return "", storageError(arrowField, "GeoJSON") +} + +func jsonExprFromSerializedStorage(arrowField arrow.Field, sourceExpr string) (string, bool) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowStringStorage(storage): + return "CAST(" + sourceExpr + " AS JSON)", true + case isArrowBinaryStorage(storage): + return "CAST(decode(" + sourceExpr + ") AS JSON)", true + default: + return "", false + } +} + +func jsonExprFromGeometryExpr(geometryExpr string) string { + return "CAST(ST_AsGeoJSON(" + geometryExpr + ") AS JSON)" +} + +func duckDBToJSON(sql string) string { + return "to_json(" + sql + ")" +} + +func arrowIngestGeometryStagingExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + ext := arrowExtensionNameFromTypeOrMetadata(arrowField) + if ext == "" { + return geometryExprFromPlainArrow(arrowField, sourceExpr) + } + return geometryExprFromExtension(ext, arrowField, sourceExpr) +} + +// geometryExprFromExtension uses GeoArrow/Hugr extension metadata as the source +// of truth. 
The physical Arrow storage type is only validated inside the +// selected extension handler; unsupported metadata never falls back to guessing. +func geometryExprFromExtension(ext string, arrowField arrow.Field, sourceExpr string) (string, error) { + switch { + case ext == geoArrowWKBExtension: + return geometryExprFromGeoArrowWKB(arrowField, sourceExpr) + case isHexWKBExtension(ext): + return geometryExprFromHexWKB(arrowField, sourceExpr) + case ext == geoArrowWKTExtension: + return geometryExprFromWKT(arrowField, sourceExpr) + case isGeoJSONExtension(ext): + return geometryExprFromGeoJSON(arrowField, sourceExpr) + case ext == arrowJSONExtension: + return geometryExprFromArrowJSON(arrowField, sourceExpr) + case isGeoArrowCoordinateExtension(ext): + return geometryExprFromGeoArrowCoordinates(ext, sourceExpr) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) + } +} + +func geometryExprFromPlainArrow(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowBinaryStorage(storage) || storage == arrow.FIXED_SIZE_BINARY: + return "ST_GeomFromWKB(" + sourceExpr + ")", nil + case isArrowStringStorage(storage): + return "ST_GeomFromText(" + sourceExpr + ", true)", nil + case isArrowObjectStorage(storage): + return "ST_GeomFromGeoJSON(" + duckDBJSONAsVarchar(sourceExpr) + ")", nil + default: + return "", fmt.Errorf("arrow column %q with type %s cannot be ingested as Geometry without geoarrow/hugr metadata", arrowField.Name, arrowField.Type) + } +} + +func geometryExprFromGeoArrowWKB(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowBinaryStorage(storage) || storage == arrow.FIXED_SIZE_BINARY { + return sourceExpr, nil + } + return "", storageError(arrowField, geoArrowWKBExtension) +} + +func geometryExprFromHexWKB(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := 
arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromWKB(from_hex(" + sourceExpr + "))", nil + } + return "", storageError(arrowField, "hexwkb") +} + +func geometryExprFromWKT(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromText(" + sourceExpr + ", true)", nil + } + return "", storageError(arrowField, geoArrowWKTExtension) +} + +func geometryExprFromGeoJSON(arrowField arrow.Field, sourceExpr string) (string, error) { + textExpr, err := geoJSONTextExpr(arrowField, sourceExpr) + if err != nil { + return "", err + } + return "ST_GeomFromGeoJSON(" + textExpr + ")", nil +} + +func geometryExprFromArrowJSON(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + if isArrowStringStorage(storage) { + return "ST_GeomFromGeoJSON(CAST(" + sourceExpr + " AS VARCHAR))", nil + } + return "", storageError(arrowField, arrowJSONExtension) +} + +func geoJSONTextExpr(arrowField arrow.Field, sourceExpr string) (string, error) { + storage := arrowStorageTypeID(arrowField.Type) + switch { + case isArrowStringStorage(storage): + return sourceExpr, nil + case isArrowBinaryStorage(storage): + return "CAST(decode(" + sourceExpr + ") AS VARCHAR)", nil + case isArrowObjectStorage(storage): + return duckDBJSONAsVarchar(sourceExpr), nil + default: + return "", storageError(arrowField, "GeoJSON") + } +} + +func duckDBJSONAsVarchar(sql string) string { + return duckDBToJSON(sql) + "::VARCHAR" +} + +func arrowExtensionNameFromTypeOrMetadata(field arrow.Field) string { + if extType, ok := field.Type.(arrow.ExtensionType); ok { + return strings.ToLower(extType.ExtensionName()) + } + if ext, ok := field.Metadata.GetValue("ARROW:extension:name"); ok { + return strings.ToLower(ext) + } + if ext, ok := field.Metadata.GetValue("extension:name"); ok { + return strings.ToLower(ext) + } + 
return "" +} + +func arrowStorageTypeID(dt arrow.DataType) arrow.Type { + if extType, ok := dt.(arrow.ExtensionType); ok { + return extType.StorageType().ID() + } + return dt.ID() +} + +func isArrowStringStorage(storage arrow.Type) bool { + switch storage { + case arrow.STRING, arrow.LARGE_STRING, arrow.STRING_VIEW: + return true + default: + return false + } +} + +func isArrowBinaryStorage(storage arrow.Type) bool { + switch storage { + case arrow.BINARY, arrow.LARGE_BINARY, arrow.BINARY_VIEW: + return true + default: + return false + } +} + +func isArrowObjectStorage(storage arrow.Type) bool { + switch storage { + case arrow.STRUCT, arrow.MAP: + return true + default: + return false + } +} + +func isGeoJSONExtension(ext string) bool { + switch ext { + case hugrGeoJSONExtension, geoArrowGeoJSONExtension, plainGeoJSONExtension: + return true + default: + return false + } +} + +func isHexWKBExtension(ext string) bool { + switch ext { + case hugrHexWKBExtension, geoArrowHexWKBExtension, plainHexWKBExtension: + return true + default: + return false + } +} + +func needsGeometryToJSON(ext string) bool { + return ext == geoArrowWKBExtension || + ext == geoArrowWKTExtension || + isHexWKBExtension(ext) || + isGeoArrowCoordinateExtension(ext) +} + +func isGeoArrowCoordinateExtension(ext string) bool { + switch ext { + case geoArrowPointExtension, + geoArrowLineStringExtension, + geoArrowPolygonExtension, + geoArrowMultiPointExtension, + geoArrowMultiLineStringExtension, + geoArrowMultiPolygonExtension, + geoArrowGeometryExtension, + geoArrowGeometryCollectionExtension: + return true + default: + return false + } +} + +func storageError(arrowField arrow.Field, format string) error { + return fmt.Errorf("arrow column %q with type %s cannot use %s storage", arrowField.Name, arrowField.Type, format) +} + +func geoArrowPointGeometryExpr(sql string) string { + return "ST_Point(struct_extract(" + sql + ", 'x'), struct_extract(" + sql + ", 'y'))" +} + +func 
geoArrowLineStringGeometryExpr(sql string) string { + return "ST_MakeLine(list_transform(" + sql + ", lambda _p: " + geoArrowPointGeometryExpr("_p") + "))" +} + +func geoArrowPolygonGeometryExpr(sql string) string { + shell := geoArrowLineStringGeometryExpr(sql + "[1]") + holes := "list_transform(" + sql + "[2:], lambda _r: " + geoArrowLineStringGeometryExpr("_r") + ")" + return "ST_MakePolygon(" + shell + ", " + holes + ")" +} + +func geoArrowMultiPointGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _p: " + geoArrowPointGeometryExpr("_p") + ")))" +} + +func geoArrowMultiLineStringGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _ls: " + geoArrowLineStringGeometryExpr("_ls") + ")))" +} + +func geoArrowMultiPolygonGeometryExpr(sql string) string { + return "ST_Multi(ST_Collect(list_transform(" + sql + ", lambda _poly: " + geoArrowPolygonGeometryExpr("_poly") + ")))" +} + +func geometryExprFromGeoArrowCoordinates(ext, sql string) (string, error) { + switch ext { + case geoArrowPointExtension: + return geoArrowPointGeometryExpr(sql), nil + case geoArrowLineStringExtension: + return geoArrowLineStringGeometryExpr(sql), nil + case geoArrowPolygonExtension: + return geoArrowPolygonGeometryExpr(sql), nil + case geoArrowMultiPointExtension: + return geoArrowMultiPointGeometryExpr(sql), nil + case geoArrowMultiLineStringExtension: + return geoArrowMultiLineStringGeometryExpr(sql), nil + case geoArrowMultiPolygonExtension: + return geoArrowMultiPolygonGeometryExpr(sql), nil + case geoArrowGeometryExtension, geoArrowGeometryCollectionExtension: + return "", fmt.Errorf("%s ingest is not supported from native union storage; send geoarrow.wkb, geoarrow.wkt, geoarrow.geojson, or a concrete GeoArrow coordinate layout", ext) + default: + return "", fmt.Errorf("unsupported GeoArrow extension %q", ext) + } +} diff --git a/pkg/engines/arrow_ingest_test.go 
b/pkg/engines/arrow_ingest_test.go new file mode 100644 index 00000000..1e3160ef --- /dev/null +++ b/pkg/engines/arrow_ingest_test.go @@ -0,0 +1,334 @@ +package engines + +import ( + "strings" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/extensions" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + "github.com/paulmach/orb" + "github.com/vektah/gqlparser/v2/ast" +) + +func TestArrowIngestJSONStagingExpr(t *testing.T) { + tests := []struct { + name string + typ arrow.DataType + ext string + want string + }{ + {name: "string", typ: arrow.BinaryTypes.String, want: "CAST(payload AS JSON)"}, + {name: "large string", typ: arrow.BinaryTypes.LargeString, want: "CAST(payload AS JSON)"}, + {name: "string view", typ: arrow.BinaryTypes.StringView, want: "CAST(payload AS JSON)"}, + {name: "binary", typ: arrow.BinaryTypes.Binary, want: "CAST(decode(payload) AS JSON)"}, + {name: "large binary", typ: arrow.BinaryTypes.LargeBinary, want: "CAST(decode(payload) AS JSON)"}, + {name: "binary view", typ: arrow.BinaryTypes.BinaryView, want: "CAST(decode(payload) AS JSON)"}, + {name: "struct", typ: arrow.StructOf(), want: "to_json(payload)"}, + {name: "list", typ: arrow.ListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "large list", typ: arrow.LargeListOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "fixed size list", typ: arrow.FixedSizeListOf(2, arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "list view", typ: arrow.ListViewOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "large list view", typ: arrow.LargeListViewOf(arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "map", typ: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), want: "to_json(payload)"}, + {name: "scalar", typ: arrow.PrimitiveTypes.Int64, want: "to_json(payload)"}, + {name: "arrow json extension", typ: mustTestArrowJSONType(t), want: "CAST(payload 
AS JSON)"}, + {name: "geojson string extension", typ: arrow.BinaryTypes.String, ext: "geoarrow.geojson", want: "CAST(payload AS JSON)"}, + {name: "geojson struct extension", typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), ext: "geoarrow.geojson", want: "to_json(payload)"}, + {name: "geo wkt extension", typ: arrow.BinaryTypes.String, ext: "geoarrow.wkt", want: "CAST(ST_AsGeoJSON(ST_GeomFromText(payload, true)) AS JSON)"}, + {name: "geo hex wkb extension", typ: arrow.BinaryTypes.String, ext: "hugr.hexwkb", want: "CAST(ST_AsGeoJSON(ST_GeomFromWKB(from_hex(payload))) AS JSON)"}, + {name: "native geoarrow point extension", typ: geoArrowTestType("geoarrow.point"), ext: "geoarrow.point", want: "CAST(ST_AsGeoJSON(ST_Point(struct_extract(payload, 'x'), struct_extract(payload, 'y'))) AS JSON)"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + meta := arrow.Metadata{} + if tt.ext != "" { + meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) + } + got, err := arrowIngestJSONStagingExpr(arrow.Field{Name: "payload", Type: tt.typ, Metadata: meta}, "payload") + if err != nil { + t.Fatal(err) + } + if got != tt.want { + t.Fatalf("got %q, want %q", got, tt.want) + } + }) + } +} + +func TestArrowIngestJSONRejectsUnsupportedExtensionMetadata(t *testing.T) { + _, err := arrowIngestJSONStagingExpr(arrow.Field{ + Name: "payload", + Type: arrow.BinaryTypes.String, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": "hugr.unknown_json"}), + }, "payload") + if err == nil { + t.Fatal("expected unsupported JSON extension to be rejected") + } + if !strings.Contains(err.Error(), `unsupported Arrow extension "hugr.unknown_json" for JSON ingest`) { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestArrowIngestStagingBuildsNativeGeoArrowSelectExpr(t *testing.T) { + field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() + + tests := []struct { + ext string + want 
string + }{ + {"geoarrow.point", "ST_Point(struct_extract(geom, 'x'), struct_extract(geom, 'y'))"}, + {"geoarrow.linestring", "ST_MakeLine(list_transform(geom"}, + {"geoarrow.polygon", "ST_MakePolygon(ST_MakeLine(list_transform(geom[1]"}, + {"geoarrow.multipoint", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multilinestring", "ST_Multi(ST_Collect(list_transform(geom"}, + {"geoarrow.multipolygon", "ST_Multi(ST_Collect(list_transform(geom"}, + } + + for _, tt := range tests { + t.Run(tt.ext, func(t *testing.T) { + got, err := staging.SelectExpr(field, arrow.Field{ + Name: "geom", + Type: geoArrowTestType(tt.ext), + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), + }, "geom") + if err != nil { + t.Fatal(err) + } + if got == "geom" { + t.Fatalf("expected explicit conversion, got raw column") + } + if !strings.Contains(got, tt.want) || + strings.Contains(got, "ST_GeomFromText(") || + strings.Contains(got, "ST_AsText(") { + t.Fatalf("unexpected conversion for %s: %s", tt.ext, got) + } + }) + } +} + +func TestArrowIngestStagingBuildsDirectGeometrySelectExpr(t *testing.T) { + field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() + + tests := []struct { + name string + typ arrow.DataType + ext string + want string + }{ + { + name: "trusted geoarrow wkb is already materialized as geometry", + typ: arrow.BinaryTypes.Binary, + ext: "geoarrow.wkb", + want: "geom", + }, + { + name: "trusted geoarrow wkt parses directly from text", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.wkt", + want: "ST_GeomFromText(geom, true)", + }, + { + name: "trusted geoarrow geojson parses directly from json", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "trusted hugr geojson parses directly from json", + typ: arrow.BinaryTypes.String, + ext: "hugr.geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "trusted plain geojson parses directly from json", + 
typ: arrow.BinaryTypes.String, + ext: "geojson", + want: "ST_GeomFromGeoJSON(geom)", + }, + { + name: "trusted geojson struct serializes to json text", + typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), + ext: "geoarrow.geojson", + want: "ST_GeomFromGeoJSON(to_json(geom)::VARCHAR)", + }, + { + name: "unannotated struct serializes to geojson text", + typ: arrow.StructOf(arrow.Field{Name: "type", Type: arrow.BinaryTypes.String}), + want: "ST_GeomFromGeoJSON(to_json(geom)::VARCHAR)", + }, + { + name: "arrow json parses as geojson text", + typ: mustTestArrowJSONType(t), + ext: "arrow.json", + want: "ST_GeomFromGeoJSON(CAST(geom AS VARCHAR))", + }, + { + name: "trusted hex wkb parses through from_hex", + typ: arrow.BinaryTypes.String, + ext: "hugr.hexwkb", + want: "ST_GeomFromWKB(from_hex(geom))", + }, + { + name: "unannotated binary parses directly as wkb", + typ: arrow.BinaryTypes.Binary, + want: "ST_GeomFromWKB(geom)", + }, + { + name: "unannotated string parses as wkt", + typ: arrow.BinaryTypes.String, + want: "ST_GeomFromText(geom, true)", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + meta := arrow.Metadata{} + if tt.ext != "" { + meta = arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}) + } + got, err := staging.SelectExpr(field, arrow.Field{ + Name: "geom", + Type: tt.typ, + Metadata: meta, + }, "geom") + if err != nil { + t.Fatal(err) + } + if got != tt.want { + t.Fatalf("expected %s, got %s", tt.want, got) + } + if strings.Contains(got, "ST_AsText") { + t.Fatalf("expected direct geometry expression without ST_AsText, got %s", got) + } + }) + } +} + +func mustTestArrowJSONType(t *testing.T) arrow.DataType { + t.Helper() + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + if err != nil { + t.Fatal(err) + } + return typ +} + +func TestArrowIngestRejectsNativeGeoArrowUnionLayouts(t *testing.T) { + field := geometryTestField("") + staging := 
NewArrowIngestStagingBuilder() + for _, ext := range []string{"geoarrow.geometry", "geoarrow.geometrycollection"} { + t.Run(ext, func(t *testing.T) { + _, err := staging.SelectExpr(field, arrow.Field{ + Name: "geom", + Type: arrow.StructOf(), + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": ext}), + }, "geom") + if err == nil { + t.Fatalf("expected %s to be rejected", ext) + } + }) + } +} + +func TestArrowIngestRejectsUnsupportedGeometryExtensionMetadata(t *testing.T) { + field := geometryTestField("") + staging := NewArrowIngestStagingBuilder() + for _, tt := range []struct { + name string + typ arrow.DataType + ext string + }{ + { + name: "string-like column does not fall back to WKT when metadata is unsupported", + typ: arrow.BinaryTypes.String, + ext: "geoarrow.curve", + }, + { + name: "binary-like column does not fall back to WKB when metadata is unsupported", + typ: arrow.BinaryTypes.Binary, + ext: "hugr.unknown_geometry", + }, + } { + t.Run(tt.name, func(t *testing.T) { + _, err := staging.SelectExpr(field, arrow.Field{ + Name: "geom", + Type: tt.typ, + Metadata: arrow.MetadataFrom(map[string]string{"ARROW:extension:name": tt.ext}), + }, "geom") + if err == nil { + t.Fatalf("expected unsupported extension %q to be rejected", tt.ext) + } + if !strings.Contains(err.Error(), "unsupported GeoArrow extension") { + t.Fatalf("unexpected error for %q: %v", tt.ext, err) + } + }) + } +} + +func TestArrowIngestStagingLiteralExpr(t *testing.T) { + staging := NewArrowIngestStagingBuilder() + + jsonSQL, err := staging.LiteralExpr(nil, map[string]any{"status": "ok"}) + if err != nil { + t.Fatal(err) + } + if strings.Contains(jsonSQL, "JSONB") || !strings.Contains(jsonSQL, "::JSON") { + t.Fatalf("expected DuckDB JSON literal, got %s", jsonSQL) + } + + geomSQL, err := staging.LiteralExpr(geometryTestField("4326"), orb.Point{1, 2}) + if err != nil { + t.Fatal(err) + } + if !strings.Contains(geomSQL, "ST_GeomFromWKB(from_hex('0101000000") || + 
strings.Contains(geomSQL, "'SRID=4326;'") || + strings.Contains(geomSQL, "ST_GeomFromText") || + strings.Contains(geomSQL, "POINT") { + t.Fatalf("expected canonical WKB geometry literal, got %s", geomSQL) + } +} + +func geometryTestField(srid string) *ast.Field { + def := &ast.FieldDefinition{ + Name: "geom", + Type: ast.NamedType(base.GeometryTypeName, nil), + } + if srid != "" { + def.Directives = ast.DirectiveList{ + &ast.Directive{ + Name: base.FieldGeometryInfoDirectiveName, + Arguments: ast.ArgumentList{ + &ast.Argument{Name: base.ArgSRID, Value: &ast.Value{Raw: srid}}, + }, + }, + } + } + return &ast.Field{ + Name: "geom", + Alias: "geom", + Definition: def, + } +} + +func geoArrowTestType(ext string) arrow.DataType { + point := arrow.StructOf( + arrow.Field{Name: "x", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "y", Type: arrow.PrimitiveTypes.Float64}, + ) + switch ext { + case "geoarrow.point": + return point + case "geoarrow.linestring", "geoarrow.multipoint": + return arrow.ListOf(point) + case "geoarrow.polygon", "geoarrow.multilinestring": + return arrow.ListOf(arrow.ListOf(point)) + case "geoarrow.multipolygon": + return arrow.ListOf(arrow.ListOf(arrow.ListOf(point))) + default: + return point + } +} diff --git a/pkg/engines/duckdb.go b/pkg/engines/duckdb.go index 4b1ef5a4..7722833a 100644 --- a/pkg/engines/duckdb.go +++ b/pkg/engines/duckdb.go @@ -71,6 +71,9 @@ func (e *DuckDB) Capabilities() *compiler.EngineCapabilities { Returning: true, InsertReferences: true, }, + Ingest: compiler.EngineIngestCapabilities{ + Insert: true, + }, Update: compiler.EngineUpdateCapabilities{ Update: true, UpdatePKColumns: true, diff --git a/pkg/engines/ducklake.go b/pkg/engines/ducklake.go index fed1902e..2c340073 100644 --- a/pkg/engines/ducklake.go +++ b/pkg/engines/ducklake.go @@ -31,6 +31,7 @@ func (e *DuckLake) Capabilities() *compiler.EngineCapabilities { dbCaps := e.duckdb.Capabilities() caps := *dbCaps // defensive copy 
caps.General.SupportTimeTravel = true + caps.Ingest = compiler.EngineIngestCapabilities{} return &caps } diff --git a/pkg/engines/engines.go b/pkg/engines/engines.go index 25f91f8e..aa47f72a 100644 --- a/pkg/engines/engines.go +++ b/pkg/engines/engines.go @@ -72,6 +72,17 @@ type EngineTypeCaster interface { CastFromIntermediateType(field *ast.Field, toJSON bool) (string, error) } +// EngineIngestValueAdapter is implemented by engines whose ingest path cannot +// consume canonical DuckDB staging values directly. Engines that do not +// implement it explicitly accept canonical staging values as their ingest +// contract. +type EngineIngestValueAdapter interface { + Engine + // AdaptIngestValueSQL adapts a DuckDB staging value SQL fragment to the + // representation expected by this engine/source during batch ingest. + AdaptIngestValueSQL(field *ast.Field, valueSQL string) (string, error) +} + type EngineVectorDistanceCalculator interface { VectorDistanceSQL(sql, distMetric string, vector types.Vector, params []any) (string, []any, error) } diff --git a/pkg/engines/iceberg.go b/pkg/engines/iceberg.go index 6cfe49cc..4bac4666 100644 --- a/pkg/engines/iceberg.go +++ b/pkg/engines/iceberg.go @@ -33,6 +33,7 @@ func (e *Iceberg) Capabilities() *compiler.EngineCapabilities { caps.General.SupportTimeTravel = true // DuckDB Iceberg extension doesn't support targeted inserts (INSERT INTO tbl(col1,col2) VALUES ...) 
caps.Insert.Insert = false + caps.Ingest = compiler.EngineIngestCapabilities{} caps.Insert.Returning = false caps.Insert.InsertReferences = false return &caps diff --git a/pkg/engines/postgres.go b/pkg/engines/postgres.go index 970d1bb6..099781e8 100644 --- a/pkg/engines/postgres.go +++ b/pkg/engines/postgres.go @@ -47,6 +47,9 @@ func (e *Postgres) Capabilities() *compiler.EngineCapabilities { Returning: true, InsertReferences: true, }, + Ingest: compiler.EngineIngestCapabilities{ + Insert: true, + }, Update: compiler.EngineUpdateCapabilities{ Update: true, UpdatePKColumns: true, diff --git a/pkg/perm/permissions.go b/pkg/perm/permissions.go index a3dd0624..d11d8e77 100644 --- a/pkg/perm/permissions.go +++ b/pkg/perm/permissions.go @@ -8,9 +8,9 @@ import ( "github.com/vektah/gqlparser/v2/ast" "github.com/hugr-lab/query-engine/pkg/auth" - "github.com/hugr-lab/query-engine/pkg/engines" "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/engines" ) type RolePermissions struct { @@ -54,7 +54,6 @@ func (r *RolePermissions) CheckQuery(query *ast.Field) error { return nil } - func (r *RolePermissions) CheckMutationInput(ctx context.Context, defs base.DefinitionsSource, inputName string, data map[string]any) error { if r.Disabled { return auth.ErrForbidden @@ -153,11 +152,15 @@ func applyContextVariable(ctx context.Context, data map[string]any, vars map[str v[i] = applyContextVariable(ctx, vv, vars) } } + res[k] = v case string: if val, ok := vars[v]; ok { res[k] = val continue } + res[k] = v + default: + res[k] = v } } diff --git a/pkg/planner/node_arrow_ingest.go b/pkg/planner/node_arrow_ingest.go new file mode 100644 index 00000000..ec381c6b --- /dev/null +++ b/pkg/planner/node_arrow_ingest.go @@ -0,0 +1,427 @@ +package planner + +import ( + "context" + "fmt" + "strings" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/hugr-lab/query-engine/pkg/auth" + 
"github.com/hugr-lab/query-engine/pkg/catalog" + "github.com/hugr-lab/query-engine/pkg/catalog/compiler/base" + "github.com/hugr-lab/query-engine/pkg/catalog/sdl" + "github.com/hugr-lab/query-engine/pkg/db" + "github.com/hugr-lab/query-engine/pkg/engines" + "github.com/hugr-lab/query-engine/pkg/perm" + "github.com/vektah/gqlparser/v2/ast" +) + +type ingestColumn struct { + ArrowField arrow.Field + Field *sdl.Field + FieldDef *ast.FieldDefinition + InputDef *ast.FieldDefinition +} + +func ingestRootNode(ctx context.Context, provider catalog.Provider, planner Catalog, dataObject string, source db.ArrowIngestSource) (*QueryPlanNode, error) { + if dataObject == "" { + return nil, fmt.Errorf("missing data object") + } + if source.Reader == nil { + return nil, fmt.Errorf("missing arrow reader") + } + if source.View() == "" { + return nil, fmt.Errorf("missing arrow view name") + } + + info, mutationField, err := resolveIngestTarget(ctx, provider, dataObject) + if err != nil { + return nil, err + } + engine, err := planner.Engine(info.Catalog) + if err != nil { + return nil, err + } + caps := engine.Capabilities() + if caps == nil { + return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + } + if !caps.Ingest.Valid() { + return nil, fmt.Errorf("engine %q has invalid ingest capabilities: merge requires insert", engine.Type()) + } + if !caps.Ingest.Available() { + return nil, fmt.Errorf("engine %q does not support IPC ingest", engine.Type()) + } + mutation := sdl.MutationInfo(ctx, provider, mutationField) + if mutation == nil || mutation.Type != sdl.MutationTypeInsert { + return nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + } + + permissionData, err := resolveIngestPermissionData(ctx, provider, info, mutationField) + if err != nil { + return nil, err + } + columns, err := resolveIngestColumns(ctx, provider, info, mutation, source.Reader.Schema(), permissionData) + if err != nil { + return nil, err + } + if 
len(columns) == 0 { + return nil, fmt.Errorf("no insertable columns matched between arrow stream and data object") + } + if err := checkIngestPermissions(ctx, provider, info, columns, permissionData); err != nil { + return nil, err + } + return ingestNode(ctx, info, mutation, engine, columns, permissionData, source.View()), nil +} + +func resolveIngestTarget(ctx context.Context, provider catalog.Provider, dataObject string) (*sdl.Object, *ast.FieldDefinition, error) { + var def *ast.Definition + if strings.Contains(dataObject, ".") { + queryDef := provider.ForName(ctx, base.QueryBaseName) + if queryDef == nil { + return nil, nil, fmt.Errorf("query base type not found in schema") + } + cur := queryDef + for _, part := range strings.Split(dataObject, ".") { + f := cur.Fields.ForName(part) + if f == nil { + return nil, nil, fmt.Errorf("data object %q: segment %q not found", dataObject, part) + } + cur = provider.ForName(ctx, f.Type.Name()) + if cur == nil { + return nil, nil, fmt.Errorf("data object %q: type %q not found", dataObject, f.Type.Name()) + } + } + def = cur + } else { + def = provider.ForName(ctx, dataObject) + } + if def == nil { + return nil, nil, fmt.Errorf("data object %q not found in schema", dataObject) + } + if !sdl.IsDataObject(def) { + return nil, nil, fmt.Errorf("%q is not a data object", dataObject) + } + info := sdl.DataObjectInfo(def) + if info == nil { + return nil, nil, fmt.Errorf("data object %q: no info", dataObject) + } + if info.Type != sdl.TableDataObject { + return nil, nil, fmt.Errorf("data object %q is not a table (got %q): only tables are ingestable", dataObject, info.Type) + } + if info.Catalog == "" { + return nil, nil, fmt.Errorf("data object %q has no catalog", dataObject) + } + _, mutationField := sdl.ObjectMutationDefinition(ctx, provider, def, sdl.MutationTypeInsert) + if mutationField == nil { + return nil, nil, fmt.Errorf("data object %q has no insert mutation defined", dataObject) + } + return info, mutationField, nil +} + 
+// resolveIngestColumns matches Arrow IPC stream columns to the GraphQL insert +// contract for the target table data object. +// +// - provider resolves GraphQL definitions generated by the catalog compiler. +// - info describes the GraphQL data object and its DB table/field mapping. +// - mutation describes the GraphQL insert mutation and insertable fields. +// - schema is the Arrow IPC schema supplied by the request body. +// - permissionData contains fields injected by permissions; required fields +// can be satisfied by either Arrow columns or these injected values. +// +// The returned ingestColumn values keep all three names/spaces together: +// Arrow field, GraphQL object/input field, and DB table column mapping. +func resolveIngestColumns(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutation *sdl.Mutation, schema *arrow.Schema, permissionData map[string]any) ([]ingestColumn, error) { + if schema == nil { + return nil, fmt.Errorf("arrow stream has no schema") + } + // GraphQL input type accepted by the insert mutation, for example + // events_mut_input_data. Arrow columns must be valid fields of this input. + inputName := info.InputInsertDataName() + if inputName == "" { + return nil, fmt.Errorf("data object %q has no insert input type", info.Name) + } + // GraphQL definition of the insert input object. This is not the DB table; + // it is the public mutation contract used for permission/input validation. + input := provider.ForName(ctx, inputName) + if input == nil { + return nil, fmt.Errorf("insert input type %q not found", inputName) + } + + // seen detects duplicate Arrow column names before they collapse into the + // byName lookup map below. + seen := map[string]struct{}{} + // byName stores the resolved ingest metadata keyed by GraphQL/Arrow field + // name. Field source directives may later map this name to another DB column. 
+ byName := make(map[string]ingestColumn, schema.NumFields()) + for _, af := range schema.Fields() { + // af is the physical Arrow IPC column. Its name is matched against the + // GraphQL insert input and data object field names. + if _, dup := seen[af.Name]; dup { + return nil, fmt.Errorf("duplicate arrow column %q", af.Name) + } + seen[af.Name] = struct{}{} + + // inputField is the GraphQL mutation input field. If it is absent, the + // client is trying to ingest a column that the insert API does not expose. + inputField := input.Fields.ForName(af.Name) + if inputField == nil { + return nil, fmt.Errorf("column %q is not defined in insert input %q", af.Name, inputName) + } + // objectField is the GraphQL field on the table data object. It carries + // type information and directives such as @geometry_info / @field_source. + objectField := info.Definition().Fields.ForName(af.Name) + if objectField == nil { + return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) + } + // fieldInfo is the compiled catalog view of objectField. It knows whether + // the GraphQL field is a real DB field, a reference, or a computed field, + // and how it maps to the table column. 
+ fieldInfo := info.FieldForName(af.Name) + if fieldInfo == nil { + return nil, fmt.Errorf("column %q is not defined in data object %q", af.Name, info.Definition().Name) + } + if fieldInfo.IsReferencesSubquery() { + return nil, fmt.Errorf("column %q is a reference and cannot be ingested directly", af.Name) + } + if fieldInfo.IsNotDBField() { + return nil, fmt.Errorf("column %q is a computed/virtual field and cannot be ingested", af.Name) + } + if fieldInfo.FieldSourceName("", false) == "-" { + return nil, fmt.Errorf("column %q has no database mapping", af.Name) + } + byName[af.Name] = ingestColumn{ + ArrowField: af, + Field: fieldInfo, + FieldDef: objectField, + InputDef: inputField, + } + } + + // Check required GraphQL insert fields that were not supplied by Arrow. + // Required fields are allowed to be omitted only when the DB/catalog can + // provide them through a sequence, default insert expression, or @default. + for _, fieldInfo := range mutation.Fields() { + if _, ok := byName[fieldInfo.Name]; ok { + continue + } + if _, ok := permissionData[fieldInfo.Name]; ok { + continue + } + if !fieldInfo.IsRequired() { + continue + } + if fieldInfo.SequenceName() != "" || mutation.FieldHasDefaultInsertExpr(fieldInfo.Name) { + continue + } + if fd := info.Definition().Fields.ForName(fieldInfo.Name); fd != nil && + fd.Directives.ForName(base.FieldDefaultDirectiveName) != nil { + continue + } + return nil, fmt.Errorf("field %q is required for ingest into %q", fieldInfo.Name, info.Name) + } + + // Preserve Arrow stream column order for the SELECT list. The map above is + // only for validation and required-field checks. 
+ columns := make([]ingestColumn, 0, len(byName)) + for _, af := range schema.Fields() { + columns = append(columns, byName[af.Name]) + } + return columns, nil +} + +func resolveIngestPermissionData(ctx context.Context, provider catalog.Provider, info *sdl.Object, mutationField *ast.FieldDefinition) (map[string]any, error) { + if auth.IsFullAccess(ctx) { + return nil, nil + } + rp := perm.PermissionsFromCtx(ctx) + if rp == nil { + return nil, nil + } + if rp.Disabled { + return nil, auth.ErrForbidden + } + + parent := sdl.ModuleTypeName(sdl.ObjectModule(info.Definition()), sdl.ModuleMutation) + if _, ok := rp.Enabled(parent, mutationField.Name); !ok { + return nil, auth.ErrForbidden + } + + arg := rp.DataArgument(ctx, parent, mutationField.Name) + if arg == nil { + return nil, nil + } + values, err := sdl.ParseDataAsInputObject(ctx, provider, &ast.Type{ + NamedType: info.InputInsertDataName(), + Position: base.CompiledPos("ingest permission data"), + }, arg, false) + if err != nil { + return nil, err + } + if values == nil { + return nil, nil + } + return values.(map[string]any), nil +} + +func checkIngestPermissions(ctx context.Context, provider catalog.Provider, info *sdl.Object, columns []ingestColumn, permissionData map[string]any) error { + if auth.IsFullAccess(ctx) { + return nil + } + rp := perm.PermissionsFromCtx(ctx) + if rp == nil { + return nil + } + if rp.Disabled { + return auth.ErrForbidden + } + + data := make(map[string]any, len(columns)+len(permissionData)) + for _, c := range columns { + data[c.InputDef.Name] = nil + } + for k, v := range permissionData { + data[k] = v + } + if err := rp.CheckMutationInput(ctx, provider, info.InputInsertDataName(), data); err != nil { + return err + } + return nil +} + +// ingestNode builds the INSERT ... SELECT statement that copies rows from the +// request-scoped DuckDB Arrow view into the target DB table. +// +// - info is the GraphQL data object plus its DB table/column mapping. 
+//   - mutation is the GraphQL insert mutation used for insert defaults.
+//   - engine describes the target and optionally adapts canonical DuckDB
+//     staging values through EngineIngestValueAdapter.
+//   - columns are Arrow columns already resolved to GraphQL fields and DB
+//     columns by resolveIngestColumns.
+//   - permissionData contains extra GraphQL input values injected by the
+//     permission layer; they do not come from the Arrow stream.
+//   - arrowViewName is the globally unique DuckDB view registered from the
+//     Arrow reader for this ingest execution.
+//
+// The returned node's CollectFunc renders
+// "INSERT INTO <target> (<columns>) SELECT <exprs> FROM <arrow view>" and
+// passes params through unchanged. Arrow-backed columns come first, in the
+// order of columns, followed by permission/default values in
+// mutation.Fields() order. It fails when no target column is produced.
+func ingestNode(ctx context.Context, info *sdl.Object, mutation *sdl.Mutation, engine engines.Engine, columns []ingestColumn, permissionData map[string]any, arrowViewName string) *QueryPlanNode {
+	return &QueryPlanNode{
+		Name: "ingest_" + info.Name,
+		CollectFunc: func(node *QueryPlanNode, children Results, params []any) (string, []any, error) {
+			staging := engines.NewArrowIngestStagingBuilder()
+			// fieldValues is keyed by GraphQL field name. Each value is a SQL
+			// expression evaluated in the SELECT part of INSERT ... SELECT.
+			// The expression may reference an Arrow column from the ingest
+			// DuckDB view, or it may be a constant/default/permission value.
+			fieldValues := make(map[string]string, len(columns))
+			for _, c := range columns {
+				// c.ArrowField.Name is the physical Arrow view column name.
+				// It is not necessarily the final DB column name; @field_source
+				// is applied later when targetFields is built.
+				value := engines.Ident(c.ArrowField.Name)
+				// Synthetic GraphQL field used only to pass type/directive
+				// metadata to the staging and optional target casters.
+				field := ingestASTField(info, c.Field, c.FieldDef)
+				// Normalize the Arrow value to a canonical DuckDB expression.
+				// Target-specific adaptation, if required, is applied below.
+				value, err := staging.SelectExpr(field, c.ArrowField, value)
+				if err != nil {
+					return "", nil, err
+				}
+				fieldValues[c.Field.Name] = value
+			}
+			for name, value := range permissionData {
+				// Permission data is addressed by GraphQL input/object field
+				// name, then converted to a staging SQL literal/expression.
+				fieldInfo := info.FieldForName(name)
+				if fieldInfo == nil {
+					return "", nil, fmt.Errorf("permission data field %q is not defined in data object %q", name, info.Name)
+				}
+				if fieldInfo.IsReferencesSubquery() || fieldInfo.IsNotDBField() {
+					return "", nil, fmt.Errorf("permission data field %q cannot be ingested directly", name)
+				}
+				fieldDef := info.Definition().Fields.ForName(name)
+				if fieldDef == nil {
+					return "", nil, fmt.Errorf("permission data field %q definition not found in data object %q", name, info.Name)
+				}
+				// Unlike Arrow columns, this value has no Arrow type. Build its
+				// canonical DuckDB literal before optional target adaptation.
+				sqlValue, err := staging.LiteralExpr(ingestASTField(info, fieldInfo, fieldDef), value)
+				if err != nil {
+					return "", nil, err
+				}
+				// This runs after the Arrow loop, so a permission value replaces
+				// the Arrow expression stored under the same field name.
+				fieldValues[name] = sqlValue
+			}
+			// Arrow ingest SELECT expressions are evaluated by DuckDB because
+			// the Arrow view is registered in DuckDB.
+			// Default/auth helper expressions must therefore use the same canonical
+			// DuckDB staging types before optional target casting is applied below.
+			if err := mutation.AppendInsertSQLExpression(fieldValues, perm.AuthVars(ctx), staging); err != nil {
+				return "", nil, err
+			}
+			var targetFields, selectExprs []string
+			for _, c := range columns {
+				// targetFields are DB table columns. FieldSourceName applies the
+				// catalog mapping from GraphQL field name to physical DB column.
+				targetFields = append(targetFields, c.Field.FieldSourceName("", true))
+				// selectExprs are evaluated from the DuckDB Arrow view and must
+				// stay in the same order as targetFields.
+				expr, err := adaptIngestValueSQL(engine, ingestASTField(info, c.Field, c.FieldDef), fieldValues[c.Field.Name])
+				if err != nil {
+					return "", nil, err
+				}
+				selectExprs = append(selectExprs, expr)
+				// Mark the entry as consumed so the loop below only sees values
+				// that are not backed by an Arrow column.
+				delete(fieldValues, c.Field.Name)
+			}
+			for _, fieldInfo := range mutation.Fields() {
+				// Remaining fieldValues are values not backed by Arrow columns:
+				// permission data and default insert expressions.
+				expr, ok := fieldValues[fieldInfo.Name]
+				if !ok {
+					continue
+				}
+				// A "-" source name is skipped: the field has no target column.
+				if fieldInfo.FieldSourceName("", false) == "-" {
+					continue
+				}
+				fieldDef := info.Definition().Fields.ForName(fieldInfo.Name)
+				if fieldDef == nil {
+					return "", nil, fmt.Errorf("ingest field %q definition not found in data object %q", fieldInfo.Name, info.Name)
+				}
+				expr, err := adaptIngestValueSQL(engine, ingestASTField(info, fieldInfo, fieldDef), expr)
+				if err != nil {
+					return "", nil, err
+				}
+				targetFields = append(targetFields, fieldInfo.FieldSourceName("", true))
+				selectExprs = append(selectExprs, expr)
+				delete(fieldValues, fieldInfo.Name)
+			}
+			if len(targetFields) == 0 {
+				return "", nil, fmt.Errorf("no values provided for ingest")
+			}
+
+			target := info.SQL(ctx, engines.Ident(info.Catalog))
+			// The FROM relation is this ingest request's globally unique Arrow view.
+			return fmt.Sprintf("INSERT INTO %s (%s) SELECT %s FROM %s",
+				target,
+				strings.Join(targetFields, ", "),
+				strings.Join(selectExprs, ", "),
+				engines.Ident(arrowViewName),
+			), params, nil
+		},
+	}
+}
+
+// adaptIngestValueSQL passes valueSQL through the engine's
+// EngineIngestValueAdapter when the engine implements it; otherwise it
+// returns valueSQL unchanged.
+func adaptIngestValueSQL(engine engines.Engine, field *ast.Field, valueSQL string) (string, error) {
+	adapter, ok := engine.(engines.EngineIngestValueAdapter)
+	if !ok {
+		return valueSQL, nil
+	}
+	return adapter.AdaptIngestValueSQL(field, valueSQL)
+}
+
+// ingestASTField builds an *ast.Field named and aliased fieldInfo.Name, with
+// fieldDef as its definition and info.Definition() as its object definition.
+func ingestASTField(info *sdl.Object, fieldInfo *sdl.Field, fieldDef *ast.FieldDefinition) *ast.Field {
+	return &ast.Field{
+		Name:             fieldInfo.Name,
+		Alias:            fieldInfo.Name,
+		Definition:       fieldDef,
+		ObjectDefinition: info.Definition(),
+	}
+}
diff --git a/pkg/planner/node_arrow_ingest_test.go b/pkg/planner/node_arrow_ingest_test.go
new file mode 100644
index 00000000..315aa924
--- /dev/null
+++ b/pkg/planner/node_arrow_ingest_test.go
@@ -0,0 +1,39 @@
+package planner
+
+import (
+	"testing"
+
+	"github.com/hugr-lab/query-engine/pkg/engines"
+	"github.com/vektah/gqlparser/v2/ast"
+)
+
+type testIngestValueAdapter struct {
+	*engines.DuckDB
+}
+
+func (e *testIngestValueAdapter) AdaptIngestValueSQL(_ *ast.Field, valueSQL string) (string, error) {
+	return "adapted(" + valueSQL + ")", nil
+}
+
+func TestAdaptIngestValueSQL(t *testing.T) {
+	t.Run("direct target", func(t *testing.T) {
+		got, err := adaptIngestValueSQL(engines.NewDuckDB(), nil, "staging_value")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got != "staging_value" {
+			t.Fatalf("got %q, want unchanged staging expression", got)
+		}
+	})
+
+	t.Run("value adapter", func(t *testing.T) {
+		engine := &testIngestValueAdapter{DuckDB: engines.NewDuckDB()}
+		got, err := adaptIngestValueSQL(engine, nil, "staging_value")
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got != "adapted(staging_value)" {
+			t.Fatalf("got %q, want adapted ingest expression", got)
+		}
+	})
+}
diff --git a/pkg/planner/planer.go b/pkg/planner/planer.go
index 806d8f33..b17fb41a 100644
---
a/pkg/planner/planer.go
+++ b/pkg/planner/planer.go
@@ -4,9 +4,10 @@ import (
 	"context"
 	"errors"
 
+	"github.com/hugr-lab/query-engine/pkg/catalog"
 	"github.com/hugr-lab/query-engine/pkg/catalog/sdl"
+	"github.com/hugr-lab/query-engine/pkg/db"
 	"github.com/hugr-lab/query-engine/pkg/engines"
-	"github.com/hugr-lab/query-engine/pkg/catalog"
 	"github.com/hugr-lab/query-engine/types"
 	"github.com/vektah/gqlparser/v2/ast"
 )
@@ -65,3 +66,19 @@ func (s *Service) Plan(ctx context.Context, provider catalog.Provider, query *as
 
 	return &QueryPlan{Query: query, RootNode: node}, nil
 }
+
+// PlanArrowIngest builds an INSERT-from-Arrow-view plan for the target data object.
+// The Arrow source is a planning input: its schema drives column resolution and
+// ingest casting, and its view name is the staging relation of the generated
+// INSERT ... SELECT. The returned plan sets only RootNode; Query stays nil.
+func (s *Service) PlanArrowIngest(ctx context.Context, provider catalog.Provider, dataObject string, source db.ArrowIngestSource) (*QueryPlan, error) {
+	node, err := ingestRootNode(ctx, provider, s.engines, dataObject, source)
+	if err != nil {
+		return nil, err
+	}
+	node.provider = provider
+	node.engines = s.engines
+	node.querier = s.querier
+
+	return &QueryPlan{RootNode: node}, nil
+}