Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
58771a3
feat: add fts support
egolearner May 15, 2026
a3dfb4e
fix mac compile & ci
egolearner May 15, 2026
c922ccc
refactor parse fts & add fts debug text
egolearner May 18, 2026
9aad98c
fix some problems
egolearner May 18, 2026
47e3996
refactor(fts_column): reorganize into tokenizer/, posting/, iterator/…
egolearner May 19, 2026
75981f3
perf: or use multi_get
egolearner May 20, 2026
a203cc6
perf: optimize disjunction iterator
egolearner May 20, 2026
b9d2c7a
perf: fts use hashskiplist
egolearner May 20, 2026
f7a84ad
refactor batch_get_postings
egolearner May 20, 2026
7b474f5
perf: optimize iterator virtual function
egolearner May 21, 2026
714299a
bench limit max_queries
egolearner May 21, 2026
c16ebaf
perf: use PinnableSlice
egolearner May 21, 2026
d206ec6
perf: bitpacked avx2
egolearner May 21, 2026
0e4a65e
chore: rm unnecessary checkpoint
egolearner May 21, 2026
b008466
perf: cache block_max_info_for result to skip repeated binary searche…
egolearner May 21, 2026
d5bf0f5
perf: precompute BM25 IDF weight per term to eliminate log() from sco…
egolearner May 21, 2026
59199bb
perf: cache SIMD dispatch function pointers in iterator to eliminate …
egolearner May 21, 2026
ee1e159
rename
egolearner May 21, 2026
eff2666
perf: push filter down into FTS composite iterators
egolearner May 21, 2026
6c747d5
refactor: drop block-max helpers superseded by block_max_info_for
egolearner May 22, 2026
f435a07
perf: candidate-driven (brute-force) FTS evaluation
egolearner May 22, 2026
e4175eb
PartialMerge no optimize
egolearner May 22, 2026
24161cf
fix fts score
egolearner May 22, 2026
b97c581
python binding support fts
egolearner May 21, 2026
31d4a57
perf: open BitPacked posting iterator only once per term
egolearner May 22, 2026
583b46b
feat: wire FTS reduce into Optimize compaction
egolearner May 25, 2026
c657e25
feat: allow schema without vector fields
egolearner May 25, 2026
9763a7c
rename cpp FtsQuery to Fts and fix compile
egolearner May 25, 2026
b071137
add c binding
egolearner May 25, 2026
6bc6575
fix: tokenize FTS query_string phrase/term through pipeline
egolearner May 26, 2026
1cd7d1c
refactor: bypass cppjieba::Jieba to drop KeywordExtractor
egolearner May 26, 2026
1bb43cb
feat: return EmptyNode instead of erroring for zero-token FTS queries
egolearner May 26, 2026
915de78
feat: dedup repeated FTS terms via AST rewriter with linear boost
egolearner May 26, 2026
87c24cf
refactor: canonicalize OR-with-must_not into AND wrapper in FTS rewriter
egolearner May 26, 2026
89dcac9
perf: batch FTS phrase position reads with MultiGet and shortest-list…
egolearner May 26, 2026
ee9bef9
fix some problems
egolearner May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Auto-generated files — collapsed in GitHub PR diffs
src/db/index/column/fts_column/gen/** linguist-generated=true
src/db/sqlengine/antlr/gen/** linguist-generated=true
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,12 @@
[submodule "thirdparty/RaBitQ-Library/RaBitQ-Library-0.1"]
path = thirdparty/RaBitQ-Library/RaBitQ-Library-0.1
url = https://github.com/VectorDB-NTU/RaBitQ-Library.git
[submodule "thirdparty/cppjieba/cppjieba-5.6.7"]
path = thirdparty/cppjieba/cppjieba-5.6.7
url = https://github.com/yanyiwu/cppjieba.git
[submodule "thirdparty/FastPFOR/FastPFOR-0.4.0"]
path = thirdparty/FastPFOR/FastPFOR-0.4.0
url = https://github.com/fast-pack/FastPFOR.git
[submodule "thirdparty/limonp/limonp-v1.0.2"]
path = thirdparty/limonp/limonp-v1.0.2
url = https://github.com/yanyiwu/limonp.git
188 changes: 188 additions & 0 deletions python/tests/test_collection_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright 2025-present the zvec project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""End-to-end tests for FTS-only collections (no vector field).

The schema validation rule "must have at least one vector field" has been
lifted; these tests pin the new behavior so insert / query / delete /
optimize all work on a vector-less collection.
"""

from __future__ import annotations

from collections.abc import Iterator

import pytest
import zvec
from zvec import (
    Collection,
    CollectionOption,
    DataType,
    Doc,
    FieldSchema,
    FtsIndexParam,
    OptimizeOption,
)
from zvec.model.param.query import Fts, Query


# ==================== Fixtures ====================


@pytest.fixture(scope="function")
def fts_collection(tmp_path_factory) -> Iterator[Collection]:
    """Yield a freshly created FTS-only collection and destroy it afterwards.

    The schema declares two STRING fields and no vector fields: ``title`` has
    no index parameter, while ``content`` is FTS-indexed with the "standard"
    tokenizer and a "lowercase" filter.

    Args:
        tmp_path_factory: pytest's factory for per-test temporary directories.

    Yields:
        The opened collection (created with ``read_only=False``).
    """
    temp_dir = tmp_path_factory.mktemp("zvec_fts_only")
    collection_path = temp_dir / "fts_collection"

    schema = zvec.CollectionSchema(
        name="fts_only",
        fields=[
            FieldSchema("title", DataType.STRING, nullable=False),
            FieldSchema(
                "content",
                DataType.STRING,
                nullable=False,
                index_param=FtsIndexParam(
                    tokenizer_name="standard",
                    filters=["lowercase"],
                ),
            ),
        ],
        # vectors omitted on purpose — schema validation must accept this.
    )

    coll = zvec.create_and_open(
        path=str(collection_path),
        schema=schema,
        option=CollectionOption(read_only=False, enable_mmap=True),
    )
    assert coll is not None

    try:
        yield coll
    finally:
        # Best-effort teardown: a failing destroy() is reported but must not
        # replace the outcome of the test that used the fixture.
        try:
            coll.destroy()
        except Exception as e:
            print(f"Warning: failed to destroy collection: {e}")


def _make_docs() -> list[Doc]:
    """Build the 5-document corpus shared by the tests.

    Documents pk_0..pk_3 contain the term 'hello'; pk_4 is the single
    document that does not.
    """
    rows = (
        ("pk_0", "intro", "hello world"),
        ("pk_1", "guide", "hello foo bar"),
        ("pk_2", "tips", "hello baz"),
        ("pk_3", "more", "hello hello"),
        ("pk_4", "other", "nothing relevant"),
    )
    return [
        Doc(id=pk, fields={"title": title, "content": content})
        for pk, title, content in rows
    ]


def _fts_query(coll: Collection, term: str) -> list[Doc]:
    """Query the `content` field with `term` as an FTS match string (topk=10)."""
    match_query = Query(field_name="content", fts=Fts(match_string=term))
    return coll.query(queries=match_query, topk=10)


# ==================== Tests ====================


class TestFtsOnlyCollectionSchema:
    """Schema-level checks for collections declared without vector fields."""

    def test_create_and_open_without_vectors(self, fts_collection: Collection):
        """A collection created with zero vector fields opens and is empty."""
        schema = fts_collection.schema
        assert schema.name == "fts_only"

        field_names = {field.name for field in schema.fields}
        assert field_names == {"title", "content"}

        # Empty vectors is the whole point of the test.
        assert list(schema.vectors) == []
        assert fts_collection.stats.doc_count == 0

    def test_create_schema_omitting_vectors_kwarg(self):
        """CollectionSchema can be built without passing `vectors=` at all."""
        content_field = FieldSchema(
            "content",
            DataType.STRING,
            nullable=False,
            index_param=FtsIndexParam(),
        )
        schema = zvec.CollectionSchema(name="bare_fts", fields=[content_field])

        assert list(schema.vectors) == []
        assert {field.name for field in schema.fields} == {"content"}


class TestFtsOnlyCollectionLifecycle:
    """Insert / delete / optimize round-trips on an FTS-only collection.

    Every test checks the statuses returned by its setup calls, so a failed
    insert or delete is reported at its source rather than as a mismatch in
    a later query assertion.
    """

    def test_insert_and_fts_query(self, fts_collection: Collection):
        """FTS-only collection supports insert + FTS query end-to-end."""
        results = fts_collection.insert(_make_docs())
        assert all(r.ok() for r in results)
        assert fts_collection.stats.doc_count == 5

        hits = _fts_query(fts_collection, "hello")
        assert len(hits) == 4
        assert {doc.id for doc in hits} == {"pk_0", "pk_1", "pk_2", "pk_3"}

        # Term that no document in the corpus contains.
        assert _fts_query(fts_collection, "missing_term_xyz") == []

    def test_delete_then_query(self, fts_collection: Collection):
        """Tombstone filter must drop deleted docs from FTS results."""
        inserted = fts_collection.insert(_make_docs())
        assert all(r.ok() for r in inserted)

        statuses = fts_collection.delete(["pk_0", "pk_4"])
        assert all(s.ok() for s in statuses)
        assert fts_collection.stats.doc_count == 3

        hits = _fts_query(fts_collection, "hello")
        assert len(hits) == 3
        assert {doc.id for doc in hits} == {"pk_1", "pk_2", "pk_3"}
        # pk_4's unique term is filtered out post-delete.
        assert _fts_query(fts_collection, "nothing") == []

    def test_optimize_rebuilds_fts(self, fts_collection: Collection):
        """Optimize with >30% deletes triggers ReduceFts; recall unchanged."""
        inserted = fts_collection.insert(_make_docs())
        assert all(r.ok() for r in inserted)

        # 40% delete ratio — above COMPACT_DELETE_RATIO_THRESHOLD=0.3, so
        # build_compact_task picks the rebuild path and ReduceFts runs.
        deleted = fts_collection.delete(["pk_0", "pk_4"])
        assert all(s.ok() for s in deleted)

        before = {doc.id for doc in _fts_query(fts_collection, "hello")}
        assert before == {"pk_1", "pk_2", "pk_3"}

        fts_collection.optimize(option=OptimizeOption())
        assert fts_collection.stats.doc_count == 3

        after = {doc.id for doc in _fts_query(fts_collection, "hello")}
        assert after == before
        assert _fts_query(fts_collection, "nothing") == []


class TestFtsOnlyCollectionQueryValidation:
    """Vector- and id-based queries must be refused by a vector-less collection."""

    def test_vector_query_rejected(self, fts_collection: Collection):
        """A query carrying a vector raises ValueError on a no-vector collection."""
        with pytest.raises(ValueError, match="vector or id"):
            fts_collection.query(
                topk=5,
                queries=Query(field_name="content", vector=[0.1, 0.2, 0.3]),
            )

    def test_id_query_rejected(self, fts_collection: Collection):
        """A query carrying an id raises ValueError on a no-vector collection."""
        first_doc_only = _make_docs()[:1]
        fts_collection.insert(first_doc_only)

        with pytest.raises(ValueError, match="vector or id"):
            fts_collection.query(
                topk=5,
                queries=Query(field_name="content", id="pk_0"),
            )
158 changes: 158 additions & 0 deletions python/tests/test_fts_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Copyright 2025-present the zvec project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for FTS (Full-Text Search) query support in the Python SDK."""

import pickle

import pytest

from zvec.model.param.query import Fts, Query
Copy link
Copy Markdown
Collaborator

@JalinWang JalinWang May 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name "Fts" is a bit too generic. Would it be more precise to name it after its underlying dependency, like _FtsQuery (binding) or FtsQueryParam (C++)?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current design is: Query is the top-level request containing multiple retrieval fields (vector, FTS, filter, etc.), and Fts is just one component within it — similar to how you'd have Vector or Filter as sibling fields.

Naming it FtsQuery would imply it's a standalone query type at the same level as Query, which is misleading. If we later split Query into multiple specialized types, FtsQuery would then be the right name for a full FTS query request.

We'll also align the C++ side to drop the "Query" suffix from the param struct for consistency.

What do you think? Any other naming suggestions?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You’re right — FtsQuery doesn’t feel quite right.

That said, Fts alone also feels a bit awkward, since it doesn’t make it clear that this is a component of Query, not a standalone type.

Maybe we should consider a name that better reflects that role, such as FtsQueryParam, FtsClause, FtsCondition, FtsOption, or FtsFilter.



class TestFtsQueryValidation:
"""Test FTS parameter validation in Query dataclass."""

def test_fts_query_string_only(self):
    """An Fts carrying only query_string validates and leaves match_string unset."""
    raw_query = '+hello -world "exact phrase"'
    q = Query(field_name="content", fts=Fts(query_string=raw_query))

    q._validate()

    assert q.fts.query_string == raw_query
    assert q.fts.match_string is None
    assert q.has_fts() is True

def test_fts_match_string_only(self):
    """An Fts carrying only match_string validates and leaves query_string unset."""
    phrase = "machine learning"
    q = Query(field_name="content", fts=Fts(match_string=phrase))

    q._validate()

    assert q.fts.match_string == phrase
    assert q.fts.query_string is None
    assert q.has_fts() is True

def test_fts_query_string_and_match_string_mutually_exclusive(self):
    """Validation rejects an Fts that sets both query_string and match_string."""
    conflicting = Fts(query_string="+hello", match_string="hello world")
    q = Query(field_name="content", fts=conflicting)

    with pytest.raises(ValueError, match="mutually exclusive"):
        q._validate()

def test_no_fts(self):
    """has_fts() is False for a Query built with a vector and no fts."""
    vector_only = Query(field_name="embedding", vector=[0.1, 0.2, 0.3])
    assert vector_only.has_fts() is False

def test_vector_and_fts_mutually_exclusive(self):
"""Cannot combine vector search with FTS in a single Query."""
q = Query(
field_name="embedding",
vector=[0.1, 0.2, 0.3],
fts=Fts(match_string="deep learning"),
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, would it be better to rename fts= here to something more explicit, like full_text= / full_text_search= / text_search=? It feels a bit clearer and makes the intent more obvious, especially for users who may not immediately recognize fts as full-text search.

)
with pytest.raises(ValueError, match="Cannot combine fts with vector search"):
q._validate()

def test_fts_without_vector_or_id(self):
    """A Query with FTS alone (no vector, no id) passes validation."""
    fts_only = Query(field_name="content", fts=Fts(query_string="hello"))

    fts_only._validate()

    assert fts_only.has_vector() is False
    assert fts_only.has_id() is False
    assert fts_only.has_fts() is True


class TestFtsQueryBinding:
    """Exercise the `_Fts` binding and the `fts` attribute of `_VectorQuery`.

    The binding imports stay inside each test, as in the original layout.
    """

    def test_import_fts_query(self):
        """A default-constructed _Fts has empty query_string and match_string."""
        from _zvec.param import _Fts

        default_fts = _Fts()
        assert default_fts.query_string == ""
        assert default_fts.match_string == ""

    def test_fts_query_set_fields(self):
        """Values assigned to _Fts attributes read back unchanged."""
        from _zvec.param import _Fts

        by_query = _Fts()
        by_query.query_string = "+hello -world"
        assert by_query.query_string == "+hello -world"

        by_match = _Fts()
        by_match.match_string = "machine learning"
        assert by_match.match_string == "machine learning"

    def test_fts_query_pickle(self):
        """_Fts keeps both string fields across a pickle round-trip."""
        from _zvec.param import _Fts

        source = _Fts()
        source.query_string = "+vector search"
        source.match_string = ""

        clone = pickle.loads(pickle.dumps(source))
        assert clone.query_string == "+vector search"
        assert clone.match_string == ""

    def test_vector_query_fts_field(self):
        """_VectorQuery.fts starts as None and accepts an _Fts instance."""
        from _zvec.param import _Fts, _VectorQuery

        vq = _VectorQuery()
        # fts is optional, so it is None until assigned.
        assert vq.fts is None

        attached = _Fts()
        attached.query_string = "hello"
        vq.fts = attached
        assert vq.fts is not None
        assert vq.fts.query_string == "hello"

    def test_vector_query_pickle_with_fts(self):
        """_VectorQuery with an fts attached survives a pickle round-trip."""
        from _zvec.param import _Fts, _VectorQuery

        attached = _Fts()
        attached.match_string = "test query"

        vq = _VectorQuery()
        vq.topk = 10
        vq.field_name = "embedding"
        vq.fts = attached

        clone = pickle.loads(pickle.dumps(vq))
        assert clone.topk == 10
        assert clone.field_name == "embedding"
        assert clone.fts is not None
        assert clone.fts.match_string == "test query"

    def test_vector_query_pickle_without_fts(self):
        """_VectorQuery with no fts survives pickling and fts stays None."""
        from _zvec.param import _VectorQuery

        vq = _VectorQuery()
        vq.topk = 5
        vq.field_name = "vec"

        clone = pickle.loads(pickle.dumps(vq))
        assert clone.topk == 5
        assert clone.field_name == "vec"
        assert clone.fts is None
4 changes: 3 additions & 1 deletion python/tests/test_query_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,9 @@ def test_init(self):
def test_do_validate_with_queries(self):
schema = MockCollectionSchema()
executor = NoVectorQueryExecutor(schema)
ctx = QueryContext(topk=10, queries=[Query(field_name="test")])
ctx = QueryContext(
topk=10, queries=[Query(field_name="test", vector=[0.1, 0.2, 0.3])]
)

with pytest.raises(
ValueError, match="Collection does not support query with vector or id"
Expand Down
5 changes: 4 additions & 1 deletion python/zvec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,14 @@
from .model.doc import Doc

# —— Query & index parameters (incl. FTS params from the C++ binding) ——
from .model.param import (
AddColumnOption,
AlterColumnOption,
CollectionOption,
FlatIndexParam,
FtsIndexParam,
FtsQueryParam,
HnswIndexParam,
HnswQueryParam,
HnswRabitqIndexParam,
Expand All @@ -73,7 +76,7 @@
VamanaIndexParam,
VamanaQueryParam,
)
from .model.param.query import Query, VectorQuery
from .model.param.query import Fts, Query, VectorQuery

# —— Schema & field definitions ——
from .model.schema import CollectionSchema, CollectionStats, FieldSchema, VectorSchema
Expand Down
Loading
Loading