diff --git a/mabel/data/internals/group_by.py b/mabel/data/internals/group_by.py index 995a95c..a6ac668 100644 --- a/mabel/data/internals/group_by.py +++ b/mabel/data/internals/group_by.py @@ -7,7 +7,7 @@ """ from collections import defaultdict -from siphashc import siphash +from xxhash import xxh3_64_intdigest def summer(x, y): @@ -25,7 +25,7 @@ def summer(x, y): "AVG": lambda x, y: 1, } -HASH_SEED = b"Anakin Skywalker" +HASH_SEED = 42 class TooManyGroups(Exception): @@ -72,14 +72,14 @@ def _map(self, collect_columns): for record in self._dictset: try: - group_key: int = siphash( - HASH_SEED, + group_key: int = xxh3_64_intdigest( "".join([str(record[column]) for column in self._columns]), + HASH_SEED ) except KeyError: - group_key: int = siphash( - HASH_SEED, + group_key: int = xxh3_64_intdigest( "".join([f"{record.get(column, '')}" for column in self._columns]), + HASH_SEED, ) if group_key not in self._group_keys.keys(): self._group_keys[group_key] = [ diff --git a/mabel/data/readers/internals/cursor.py b/mabel/data/readers/internals/cursor.py index eb1268f..8d6ab80 100644 --- a/mabel/data/readers/internals/cursor.py +++ b/mabel/data/readers/internals/cursor.py @@ -3,7 +3,7 @@ Cursor is made of three parts: - map : a bit array representing all of the blobs in the set - unread blobs - are 0s and read blobs are 1s. This allows for blobs to be read in + are 0s and read blobs are 1s. This allows for blobs to be read in an arbitrary order - although currently only implemented linearly. - partition: the active parition (blob) that is being read - location : the record in the active partition (blob), so we can resume reading @@ -11,7 +11,7 @@ """ import orjson -from orso.cityhash import CityHash64 +from xxhash import xxh3_64_intdigest class InvalidCursor(Exception): @@ -47,7 +47,9 @@ def load_cursor(self, cursor): self.location = cursor["location"] find_partition = [ - blob for blob in self.readable_blobs if CityHash64(blob) == cursor["partition"] + blob + for blob in self.readable_blobs + if xxh3_64_intdigest(blob, 0) == cursor["partition"] ] if len(find_partition) == 1: self.partition = find_partition[0] @@ -67,7 +69,7 @@ def next_blob(self, previous_blob=None): if self.partition in self.readable_blobs: return self.partition partition_finder = [ - blob for blob in self.readable_blobs if CityHash64(blob) == self.partition + blob for blob in self.readable_blobs if xxh3_64_intdigest(blob, 0) == self.partition ] if len(partition_finder) != 1: raise ValueError(f"Unable to determine current partition ({self.partition})") @@ -103,7 +105,7 @@ def __getitem__(self, item): ) return blob_map.tobytes().hex() if item == "partition": - return CityHash64(self.partition) + return xxh3_64_intdigest(self.partition, 0) if item == "location": return self.location return None diff --git a/mabel/data/readers/internals/threaded_wrapper.py b/mabel/data/readers/internals/threaded_wrapper.py index 4aa5f7d..05dc9be 100644 --- a/mabel/data/readers/internals/threaded_wrapper.py +++ b/mabel/data/readers/internals/threaded_wrapper.py @@ -1,6 +1,4 @@ -""" - -""" +""" """ import logging import threading diff --git a/mabel/data/validator/__init__.py b/mabel/data/validator/__init__.py index e9c4d87..a1ad905 100644 --- a/mabel/data/validator/__init__.py +++ b/mabel/data/validator/__init__.py @@ -11,7 +11,7 @@ def schema_loader( - definition: Union[str, List[Dict[str, Any]], dict, RelationSchema, bool] + definition: Union[str, List[Dict[str, Any]], dict, RelationSchema, bool], ) -> Union[RelationSchema, bool]: if definition is None: raise ValueError( diff --git a/mabel/data/writers/internals/blob_writer.py b/mabel/data/writers/internals/blob_writer.py index 4adece5..2ad72a9 100644 --- a/mabel/data/writers/internals/blob_writer.py +++ b/mabel/data/writers/internals/blob_writer.py @@ -165,7 +165,7 @@ def commit(self): pytable = self._normalize_arrow_schema(pytable, self.schema) tempfile = io.BytesIO() - pyarrow.parquet.write_table(pytable, where=tempfile, compression="zstd") + pyarrow.parquet.write_table(pytable, where=tempfile) tempfile.seek(0) write_buffer = tempfile.read() diff --git a/mabel/utils/dates.py b/mabel/utils/dates.py index 1e35645..61ffe07 100644 --- a/mabel/utils/dates.py +++ b/mabel/utils/dates.py @@ -41,7 +41,7 @@ def parse_delta(delta: str) -> datetime.timedelta: def parse_iso( - value: Union[str, int, float, datetime.datetime, datetime.date] + value: Union[str, int, float, datetime.datetime, datetime.date], ) -> Optional[datetime.datetime]: """ Parses an ISO date string into a datetime object, with an emphasis on speed. diff --git a/mabel/version.py b/mabel/version.py index a98c243..41ac715 100644 --- a/mabel/version.py +++ b/mabel/version.py @@ -1,6 +1,6 @@ # Store the version here so: # 1) we don't load dependencies by storing it in __init__.py # 2) we can import it in setup.py for the same reason -__version__ = "0.6.24" +__version__ = "0.6.25" # nodoc - don't add to the documentation wiki diff --git a/requirements.txt b/requirements.txt index ce8d42d..d7aa106 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,5 @@ bitarray lz4 orjson orso>=0.0.147 -siphashc +xxhash zstandard \ No newline at end of file diff --git a/tests/performance/index_performance.py b/tests/performance/index_performance.py index a872997..442cb6a 100644 --- a/tests/performance/index_performance.py +++ b/tests/performance/index_performance.py @@ -1,11 +1,11 @@ """ Results (seconds to search for a username in 65,500 rows): - indexed | row exists | time + indexed | row exists | time ------------------------------- yes | yes | 0.094 <- about 3.5x faster when is match yes | no | 0.006 <- over 50x faster when no match - no | yes | 0.357 + no | yes | 0.357 no | no | 0.332 ------------------------------- """ diff --git a/tests/performance/indexing.py b/tests/performance/indexing.py index d6d8dc7..a4cc644 100644 --- a/tests/performance/indexing.py +++ b/tests/performance/indexing.py @@ -1,6 +1,4 @@ -""" - -""" +""" """ import sys import os diff --git a/tests/performance/json_performance.py b/tests/performance/json_performance.py index c90af1e..9d161ef 100644 --- a/tests/performance/json_performance.py +++ b/tests/performance/json_performance.py @@ -1,13 +1,13 @@ """ -JSON parsing and serialization performance tests so a decision on +JSON parsing and serialization performance tests so a decision on which library(s) to use can be made - previously the selection was inconsistent. Results (seconds to process 10m rows): - library | parsing | serialize + library | parsing | serialize ------------------------------- - json | 36.6 | 1.74 + json | 36.6 | 1.74 ujson | 16.5 | 0.86 orjson | 10.4 | 0.66 <- lower is better simd | 2.8 | N/A <- lower is better diff --git a/tests/test_data_dictset.py b/tests/test_data_dictset.py index d0807af..f44306b 100644 --- a/tests/test_data_dictset.py +++ b/tests/test_data_dictset.py @@ -2,6 +2,7 @@ import sys sys.path.insert(1, os.path.join(sys.path[0], "..")) + from mabel import Reader, DictSet from mabel.data import STORAGE_CLASS from mabel.adapters.disk import DiskReader @@ -189,7 +190,7 @@ def test_hash(): ] ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) hashval = hash(ds) - assert hashval == 8826660322425604498, hashval + assert hashval == 386528107484878589, hashval def test_sort(): @@ -210,6 +211,6 @@ def test_sort(): if __name__ == "__main__": # pragma: no cover - from tests.helpers.runner import run_tests + from helpers.runner import run_tests run_tests() diff --git a/tests/test_data_group_by.py b/tests/test_data_group_by.py index 3d95c6c..28013ce 100644 --- a/tests/test_data_group_by.py +++ b/tests/test_data_group_by.py @@ -38,17 +38,24 @@ def test_group_by(): gb = GroupBy(ds, "user") cn = gb.count() ls = list(cn) - assert ls == [{'COUNT(*)': 6, 'user': 'alice'}, {'COUNT(*)': 5, 'user': 'bob'}, {'COUNT(*)': 2, 'user': 'eve'}], ls + expected = [{'COUNT(*)': 6, 'user': 'alice'}, {'COUNT(*)': 5, 'user': 'bob'}, {'COUNT(*)': 2, 'user': 'eve'}] + assert set(tuple(sorted(d.items())) for d in ls) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gb = list(GroupBy(ds, "user").average("value")) - assert gb == [{'AVG(value)': Decimal("4.0"), 'user': 'alice'}, {'AVG(value)': Decimal("1.4"), 'user': 'bob'}, {'AVG(value)': Decimal("6.5"), 'user': 'eve'}], gb + expected = [{'AVG(value)': Decimal("4.0"), 'user': 'alice'}, {'AVG(value)': Decimal("1.4"), 'user': 'bob'}, {'AVG(value)': Decimal("6.5"), 'user': 'eve'}] + assert set(tuple(sorted(d.items())) for d in gb) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) - assert list(GroupBy(ds, "user").max("value")) == [{'MAX(value)': 5, 'user': 'alice'}, {'MAX(value)': 2, 'user': 'bob'}, {'MAX(value)': 7, 'user': 'eve'}] + gs = list(GroupBy(ds, "user").max("value")) + expected = [{'MAX(value)': 5, 'user': 'alice'}, {'MAX(value)': 2, 'user': 'bob'}, {'MAX(value)': 7, 'user': 'eve'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) - assert list(GroupBy(ds, "user").min("value")) == [{'MIN(value)': 3, 'user': 'alice'}, {'MIN(value)': 1, 'user': 'bob'}, {'MIN(value)': 6, 'user': 'eve'}] + gs = list(GroupBy(ds, "user").min("value")) + expected = [{'MIN(value)': 3, 'user': 'alice'}, {'MIN(value)': 1, 'user': 'bob'}, {'MIN(value)': 6, 'user': 'eve'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) + # fmt:on @@ -75,27 +82,34 @@ def test_combined_group_by(): # fmt:off ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).groups()) - assert gs == [{'fname': 'bob', 'sname': 'smith'}, {'fname': 'bob', 'sname': 'jones'}, {'fname': 'alice', 'sname': 'jones'}, {'fname': 'alice', 'sname': 'smith'}, {'fname': 'eve', 'sname': 'jones'}, {'fname': 'eve', 'sname': 'smith'}], gs + expected = [{'fname': 'bob', 'sname': 'smith'}, {'fname': 'bob', 'sname': 'jones'}, {'fname': 'alice', 'sname': 'jones'}, {'fname': 'alice', 'sname': 'smith'}, {'fname': 'eve', 'sname': 'jones'}, {'fname': 'eve', 'sname': 'smith'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).count()) - assert gs == [{'COUNT(*)': 2, 'fname': 'bob', 'sname': 'jones'}, {'COUNT(*)': 3, 'fname': 'bob', 'sname': 'smith'}, {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'smith'}, {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'smith'}, {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'jones'}, {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'jones'}], gs + expected = [{'COUNT(*)': 2, 'fname': 'bob', 'sname': 'jones'}, {'COUNT(*)': 3, 'fname': 'bob', 'sname': 'smith'}, {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'smith'}, {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'smith'}, {'COUNT(*)': 1, 'fname': 'eve', 'sname': 'jones'}, {'COUNT(*)': 3, 'fname': 'alice', 'sname': 'jones'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).average('value')) - assert gs == [{'AVG(value)': Decimal('2.0'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(value)': Decimal('1.0'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(value)': Decimal('7.0'), 'fname': 'eve', 'sname': 'smith'}, {'AVG(value)': Decimal('6.0'), 'fname': 'eve', 'sname': 'jones'}, {'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'}], gs + expected = [{'AVG(value)': Decimal('2.0'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(value)': Decimal('1.0'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(value)': Decimal('7.0'), 'fname': 'eve', 'sname': 'smith'}, {'AVG(value)': Decimal('6.0'), 'fname': 'eve', 'sname': 'jones'}, {'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) + ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).average('cost')) - assert gs == [{'AVG(cost)': Decimal('3.0'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(cost)': Decimal('1.0'), 'fname': 'eve', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.0'), 'fname': 'eve', 'sname': 'jones'}, {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'fname': 'alice', 'sname': 'jones'}], gs + expected = [{'AVG(cost)': Decimal('3.0'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(cost)': Decimal('1.0'), 'fname': 'eve', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.0'), 'fname': 'eve', 'sname': 'jones'}, {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'fname': 'alice', 'sname': 'jones'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).average(('cost', 'value',))) - assert gs == [{'AVG(cost)': Decimal('3'), 'AVG(value)': Decimal('2'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'AVG(value)': Decimal('1'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(cost)': 1.0, 'AVG(value)': 7.0, 'fname': 'eve', 'sname': 'smith'}, {'AVG(cost)': 2.0, 'AVG(value)': 6.0, 'fname': 'eve', 'sname': 'jones'}, {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'}] + expected = [{'AVG(cost)': Decimal('3'), 'AVG(value)': Decimal('2'), 'fname': 'bob', 'sname': 'jones'}, {'AVG(cost)': Decimal('1.666666666666666666666666667'), 'AVG(value)': Decimal('1'), 'fname': 'bob', 'sname': 'smith'}, {'AVG(cost)': Decimal('2.333333333333333333333333333'), 'AVG(value)': Decimal('4.333333333333333333333333333'), 'fname': 'alice', 'sname': 'smith'}, {'AVG(cost)': 1.0, 'AVG(value)': 7.0, 'fname': 'eve', 'sname': 'smith'}, {'AVG(cost)': 2.0, 'AVG(value)': 6.0, 'fname': 'eve', 'sname': 'jones'}, {'AVG(cost)': Decimal('3.333333333333333333333333333'), 'AVG(value)': Decimal('3.666666666666666666666666667'), 'fname': 'alice', 'sname': 'jones'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) gs = list(GroupBy(ds, ("fname", "sname")).aggregate([('MAX', 'value'),('MIN', 'cost')])) - assert gs == [{'MAX(value)': 2, 'MIN(cost)': 2, 'fname': 'bob', 'sname': 'jones'}, {'MAX(value)': 1, 'MIN(cost)': 1, 'fname': 'bob', 'sname': 'smith'}, {'MAX(value)': 5, 'MIN(cost)': 1, 'fname': 'alice', 'sname': 'smith'}, {'MAX(value)': 7, 'MIN(cost)': 1, 'fname': 'eve', 'sname': 'smith'}, {'MAX(value)': 6, 'MIN(cost)': 2, 'fname': 'eve', 'sname': 'jones'}, {'MAX(value)': 5, 'MIN(cost)': 2, 'fname': 'alice', 'sname': 'jones'}], gs + expected = [{'MAX(value)': 2, 'MIN(cost)': 2, 'fname': 'bob', 'sname': 'jones'}, {'MAX(value)': 1, 'MIN(cost)': 1, 'fname': 'bob', 'sname': 'smith'}, {'MAX(value)': 5, 'MIN(cost)': 1, 'fname': 'alice', 'sname': 'smith'}, {'MAX(value)': 7, 'MIN(cost)': 1, 'fname': 'eve', 'sname': 'smith'}, {'MAX(value)': 6, 'MIN(cost)': 2, 'fname': 'eve', 'sname': 'jones'}, {'MAX(value)': 5, 'MIN(cost)': 2, 'fname': 'alice', 'sname': 'jones'}] + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) # fmt:on @@ -109,15 +123,17 @@ def test_gappy_set(): {"key": 4, "value": None, "plus1": 5}, ] ds = DictSet(data, storage_class=STORAGE_CLASS.MEMORY) - g = list(ds.group_by("value").average("key")) - assert g == [ + gs = list(ds.group_by("value").average("key")) + expected = [ {"AVG(key)": 4.0, "value": None}, {"AVG(key)": 3.0, "value": "two"}, {"AVG(key)": 1.0, "value": "one"}, - ], g + ] + + assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) if __name__ == "__main__": # pragma: no cover - from tests.helpers.runner import run_tests + from helpers.runner import run_tests run_tests() diff --git a/tests/test_reader_cursor.py b/tests/test_reader_cursor.py index e824357..8f81fa4 100644 --- a/tests/test_reader_cursor.py +++ b/tests/test_reader_cursor.py @@ -37,8 +37,6 @@ def test_cursor(): for i in range(number_of_records): data.append({"one": 1, "index": i}) - reader = Reader(inner_reader=NullReader, dataset="none", partitions=None, data=data) - # create random offsets for testing - it's illogical to have a 0 cursor offsets = (entropy.random_range(1, number_of_records) for i in range(20)) @@ -135,7 +133,7 @@ def test_cursor_as_text(): offsets = (entropy.random_range(1, number_of_records - 1) for i in range(5)) for offset in offsets: - cursor = {"location": offset, "map": "00", "partition": 1983839293359648136} + cursor = {"location": offset, "map": "00", "partition": 13429097919052166063} reader = Reader( inner_reader=DiskReader, dataset="tests/data/formats/jsonl", @@ -164,7 +162,7 @@ def test_move_to_cursor(): inner_reader=DiskReader, dataset="tests/data/formats/jsonl", partitions=None, - cursor={"location": offset, "map": "00", "partition": 1983839293359648136}, + cursor={"location": offset, "map": "00", "partition": 13429097919052166063}, persistence=STORAGE_CLASS.NO_PERSISTANCE, ) @@ -229,6 +227,6 @@ def test_multiple_files(): if __name__ == "__main__": # pragma: no cover - from tests.helpers.runner import run_tests + from helpers.runner import run_tests run_tests() diff --git a/tests/test_utils_common.py b/tests/test_utils_common.py index 9e2e835..624519b 100644 --- a/tests/test_utils_common.py +++ b/tests/test_utils_common.py @@ -1,6 +1,4 @@ -""" - -""" +""" """ import os import sys