Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions src/couch/priv/couch_ejson_compare/couch_ejson_compare.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ static ErlNifMutex* collMutex = NULL;

/* Forward declarations of the NIF entry points. All share the standard NIF
 * signature: (environment, argument count, argument array) -> term. */
static ERL_NIF_TERM less_json_nif(ErlNifEnv*, int, const ERL_NIF_TERM []);
static ERL_NIF_TERM compare_strings_nif(ErlNifEnv*, int, const ERL_NIF_TERM []);
static ERL_NIF_TERM get_sort_key_nif(ErlNifEnv*, int, const ERL_NIF_TERM []);
static ERL_NIF_TERM get_icu_version(ErlNifEnv*, int, const ERL_NIF_TERM []);
static ERL_NIF_TERM get_uca_version(ErlNifEnv*, int, const ERL_NIF_TERM []);
static ERL_NIF_TERM get_collator_version(ErlNifEnv*, int, const ERL_NIF_TERM []);
Expand Down Expand Up @@ -174,6 +175,81 @@ compare_strings_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
}


/*
 * Return an ICU collation sort key as an Erlang binary. Two keys that collate
 * equally produce identical sort keys. Sort keys can be used to keep a bunch
 * of ICU key-value pairs in collation order (in a sorted KV data structure for
 * example) and minimize the number of times the ICU pair-wise comparison
 * function is called while keeping that data structure sorted. The only
 * caveat is not to compare sort keys generated by different major versions of
 * libicu, so use them on the same node in memory and don't store them on disk.
 *
 * argv[0] must be a binary holding the UTF-8 string to build the key for.
 *
 * Returns the sort key as a new binary. Raises badarg when argv[0] is not a
 * binary, when it is too large to hand to ICU, when no collator is available
 * or when ICU reports a failure. Raises enomem when the key buffer cannot be
 * grown.
 */
ERL_NIF_TERM
get_sort_key_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary bin;
    UCollator* coll;
    UErrorCode status = U_ZERO_ERROR;
    UCharIterator iter;
    /* Part of libicu "Between calls to the API you need to save a 64-bit state"
     * see https://unicode-org.github.io/icu/userguide/collation/api.html */
    uint32_t state[2] = {0, 0};
    /* Small on-stack buffer used before falling back to heap allocation */
    uint8_t keystack[256];
    uint8_t* key = keystack;
    int32_t keycap = (int32_t) sizeof(keystack);
    int32_t keylen = 0;
    unsigned char* out;
    ERL_NIF_TERM result;

    if (!enif_inspect_binary(env, argv[0], &bin)) {
        return enif_make_badarg(env);
    }

    /* uiter_setUTF8 takes an int32_t length and treats a negative value as
     * "NUL-terminated". A larger binary would wrap to a negative length and
     * make ICU scan past the end of the binary, so reject it up front. */
    if (bin.size > (size_t) INT32_MAX) {
        return enif_make_badarg(env);
    }

    coll = get_collator();
    if (coll == NULL) {
        return enif_make_badarg(env);
    }

    uiter_setUTF8(&iter, (const char*) bin.data, (int32_t) bin.size);

    /* Start with the 256 byte stack buffer. Whenever ICU fills all the space
     * we offered, double the capacity on the heap and ask for the next part.
     * Getting back fewer bytes than we offered means the key is complete. */
    for (;;) {
        int32_t want = keycap - keylen;
        int32_t got = ucol_nextSortKeyPart(coll, &iter, state, key + keylen, want, &status);
        int32_t newcap;
        uint8_t* grown;

        if (U_FAILURE(status)) {
            if (key != keystack) {
                enif_free(key);
            }
            return enif_make_badarg(env);
        }
        keylen += got;
        if (got < want) {
            break;
        }

        /* Doubling must not overflow the int32_t sizes ICU works with */
        if (keycap > INT32_MAX / 2) {
            if (key != keystack) {
                enif_free(key);
            }
            return enif_make_badarg(env);
        }
        newcap = keycap * 2;

        if (key == keystack) {
            grown = enif_alloc(newcap);
            if (grown != NULL) {
                memcpy(grown, keystack, keylen);
            }
        } else {
            /* On failure the old buffer is still valid and still ours, so
             * keep `key` pointing at it until we know realloc succeeded. */
            grown = enif_realloc(key, newcap);
        }
        if (grown == NULL) {
            if (key != keystack) {
                enif_free(key);
            }
            return enif_raise_exception(env, enif_make_atom(env, "enomem"));
        }
        key = grown;
        keycap = newcap;
    }

    /* Note: this crashes when out of memory */
    out = enif_make_new_binary(env, keylen, &result);
    memcpy(out, key, keylen);

    if (key != keystack) {
        enif_free(key);
    }

    return result;
}


ERL_NIF_TERM
get_icu_version(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
Expand Down Expand Up @@ -555,6 +631,7 @@ on_unload(ErlNifEnv* env, void* priv_data)
static ErlNifFunc nif_functions[] = {
{"less_nif", 2, less_json_nif},
{"compare_strings_nif", 2, compare_strings_nif},
{"get_sort_key_nif", 1, get_sort_key_nif},
{"get_icu_version", 0, get_icu_version},
{"get_uca_version", 0, get_uca_version},
{"get_collator_version", 0, get_collator_version}
Expand Down
33 changes: 32 additions & 1 deletion src/couch/src/couch_ejson_compare.erl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
less/2,
less_json_ids/2,
less_json/2,
sort_key/1,
get_icu_version/0,
get_uca_version/0,
get_collator_version/0
Expand All @@ -25,7 +26,8 @@
-export([
less_nif/2,
less_erl/2,
compare_strings_nif/2
compare_strings_nif/2,
get_sort_key_nif/1
]).

-on_load(init/0).
Expand Down Expand Up @@ -58,6 +60,35 @@ less_json_ids({JsonA, IdA}, {JsonB, IdB}) ->
% Boolean form of less/2: true only when A collates strictly before B.
less_json(A, B) ->
    Order = less(A, B),
    Order < 0.

% Encode an ejson value as a term whose native Erlang ordering matches the ICU
% collation order of less/2. Each result is a tuple whose leading integer is
% the rank of the ejson type, see [1]. The collation order is:
%
%   null < false < true < num < str < array < object
%
% Strings are replaced by their ICU sort key, arrays and objects are encoded
% recursively.
%
% Sort keys should not be stored, or compared against sort keys generated by
% another major libicu version. The intent is to use them only at runtime, in
% memory, on the same node (most likely on the coordinator when merge-sorting
% incoming view rows).
%
% [1] https://docs.couchdb.org/en/stable/ddocs/views/collation.html#collation-specification
%
sort_key(null) ->
    {0};
sort_key(false) ->
    {1};
sort_key(true) ->
    {2};
sort_key(Number) when is_number(Number) ->
    {3, Number};
sort_key(String) when is_binary(String) ->
    {4, get_sort_key_nif(String)};
sort_key(Array) when is_list(Array) ->
    {5, [sort_key(Item) || Item <- Array]};
% The {<<255,255,255,255>>} sentinel (?MAX_JSON_OBJ max key) sorts above all
% json, so it gets the highest rank. This may be belt-and-suspenders: in
% practice we should never emit that object but might see it as part of a
% query.
sort_key({<<255, 255, 255, 255>>}) ->
    {7, []};
sort_key({Props}) when is_list(Props) ->
    {6, [{sort_key(Key), sort_key(Value)} || {Key, Value} <- Props]}.

% Erlang-side placeholder for the get_sort_key_nif/1 NIF. If this body is ever
% executed it raises get_sort_key_nif_load_error through erlang:nif_error/1.
get_sort_key_nif(_Binary) ->
    erlang:nif_error(get_sort_key_nif_load_error).

% Erlang-side placeholder for the get_icu_version/0 NIF. If this body is ever
% executed it raises get_icu_version through erlang:nif_error/1.
get_icu_version() ->
erlang:nif_error(get_icu_version).

Expand Down
102 changes: 102 additions & 0 deletions src/couch/test/eunit/couch_ejson_compare_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,18 @@ prop_any_json_is_less_than_max_json() ->
less(V, ?MAX_JSON_OBJ) =:= -1
end).

% The sort key of ?MAX_JSON_OBJ (used in Mango) is higher than the sort key of
% every value in ?TEST_VALUES
prop_sort_key_test_values_less_than_max() ->
    ?FORALL(
        Value,
        oneof(?TEST_VALUES),
        sort_key_cmp(Value, ?MAX_JSON_OBJ) =:= -1
    ).

% The sort key of any generated json value is lower than that of ?MAX_JSON_OBJ
prop_sort_key_any_json_less_than_max() ->
    ?FORALL(
        Json,
        json(),
        sort_key_cmp(Json, ?MAX_JSON_OBJ) =:= -1
    ).

% In general, for any json, the nif collator matches the erlang collator
prop_nif_matches_erlang() ->
?FORALL(
Expand All @@ -114,6 +126,48 @@ prop_nif_matches_erlang() ->
end)
).

% Comparing the sort keys of two json values gives the same result as less/2
prop_sort_key_matches_less() ->
    ?FORALL(
        JsonA,
        json(),
        ?FORALL(
            JsonB,
            json(),
            sort_key_cmp(JsonA, JsonB) =:= less(JsonA, JsonB)
        )
    ).

% Sorting a list by sort key yields the same list as sorting it with less/2
prop_sort_key_sorts_like_less() ->
    LessOrEqual = fun(A, B) -> couch_ejson_compare:less(A, B) =< 0 end,
    KeyLessOrEqual = fun(A, B) ->
        couch_ejson_compare:sort_key(A) =< couch_ejson_compare:sort_key(B)
    end,
    ?FORALL(
        Values,
        list(json()),
        begin
            ByLess = lists:sort(LessOrEqual, Values),
            BySortKey = lists:sort(KeyLessOrEqual, Values),
            ByLess =:= BySortKey
        end
    ).

% Specifically check unicode strings. (The general idea here is that we'd like
% to spend our "randomization budget" exploring unicode strings more than
% various term shapes).
prop_sort_key_nif_matches_less() ->
    ?FORALL(
        StrA,
        sort_key_string(),
        ?FORALL(
            StrB,
            sort_key_string(),
            sort_key_nif_cmp(StrA, StrB) =:= less(StrA, StrB)
        )
    ).

% Extra check that grouping works. Surround the value with various zero-width
% characters and ensure the sort key is the same as without them. In other
% words, we'd group things like this together.
prop_sort_key_equivalent_strings() ->
    ?FORALL(
        {Prefix, Suffix},
        {zero_width_list(), zero_width_list()},
        begin
            Padded = unicode:characters_to_binary(Prefix ++ [$a] ++ Suffix),
            PlainKey = couch_ejson_compare:get_sort_key_nif(<<"a">>),
            PlainKey =:= couch_ejson_compare:get_sort_key_nif(Padded)
        end
    ).

% Generators

json() ->
Expand Down Expand Up @@ -164,6 +218,24 @@ zero_width_list() ->
% Generator picking one of the code points U+200B, U+200C or U+200D
zero_width_chars() ->
    CodePoints = [16#200B, 16#200C, 16#200D],
    oneof(CodePoints).

% Besides json strings we also handle ?MAX_UNICODE_STRING (the
% <<255,255,255,255>> top sorting value), so make sure we mix that value into
% the strings we pass to the ICU library. It should handle them as of ICU
% version >= 59
sort_key_string() ->
    Choices = [json_string(), ?MAX_UNICODE_STRING],
    oneof(Choices).

% Compare two ejson values through their sort_key/1 encodings; returns -1, 0
% or 1
sort_key_cmp(A, B) ->
    KeyA = couch_ejson_compare:sort_key(A),
    KeyB = couch_ejson_compare:sort_key(B),
    term_cmp(KeyA, KeyB).

% Compare two strings through their raw ICU sort keys; returns -1, 0 or 1
sort_key_nif_cmp(A, B) ->
    KeyA = couch_ejson_compare:get_sort_key_nif(A),
    KeyB = couch_ejson_compare:get_sort_key_nif(B),
    term_cmp(KeyA, KeyB).

% Three-way comparison using native Erlang term order, returning the same
% shape as less/2: -1 when A sorts first, 1 when B sorts first, 0 otherwise
term_cmp(A, B) ->
    if
        A < B -> -1;
        A > B -> 1;
        true -> 0
    end.

-else.

-include_lib("couch/include/couch_eunit.hrl").
Expand Down Expand Up @@ -238,6 +310,36 @@ compare_strings_nif_test() ->
?assertError(badarg, compare_strings(<<"a">>, 42)),
?assertError(badarg, compare_strings(42, 42)).

% Check that sort keys can be larger than the NIF's internal 256 byte stack
% buffer, just so we get some coverage of that path.
get_sort_key_nif_large_test() ->
    SortKey = fun couch_ejson_compare:get_sort_key_nif/1,

    Small = binary:copy(<<"a">>, 16),
    Large = binary:copy(<<"a">>, 1000),
    SmallKey = SortKey(Small),
    LargeKey = SortKey(Large),

    ?assert(byte_size(SmallKey) =< 256),
    ?assert(byte_size(LargeKey) > 512),

    % Belt and suspenders: the large-key path must agree with less/2
    Larger = <<Large/binary, "b">>,
    ?assertEqual(-1, less(Large, Larger)),
    LargerKey = SortKey(Larger),
    ?assert(LargeKey < LargerKey),

    % Appending lots of zero width characters to a large string must not
    % change how it collates
    ZeroWidth = binary:copy(<<16#E2, 16#80, 16#8B>>, 300),
    Equiv = <<Large/binary, ZeroWidth/binary>>,
    ?assertEqual(0, less(Large, Equiv)),
    EquivKey = SortKey(Equiv),
    ?assertEqual(LargeKey, EquivKey).

% The sort key of ?MAX_JSON_OBJ is stable across calls and every listed value
% compares lower than it.
sort_key_max_json_obj_test() ->
    Max = couch_ejson_compare:sort_key(?MAX_JSON_OBJ),
    ?assertEqual(Max, couch_ejson_compare:sort_key(?MAX_JSON_OBJ)),
    LowerValues = [
        null,
        false,
        true,
        42,
        <<"z">>,
        ?MAX_UNICODE_STRING,
        [1, 2],
        {[{<<"a">>, 1}]}
    ],
    AssertBelowMax = fun(Value) ->
        ?assertEqual(-1, sort_key_cmp(Value, ?MAX_JSON_OBJ))
    end,
    lists:foreach(AssertBelowMax, LowerValues).

% Helper functions

less(A, B) ->
Expand Down
Loading