diff --git a/mabel/adapters/google/google_cloud_storage_writer.py b/mabel/adapters/google/google_cloud_storage_writer.py index f15d32b..dc41e0f 100644 --- a/mabel/adapters/google/google_cloud_storage_writer.py +++ b/mabel/adapters/google/google_cloud_storage_writer.py @@ -46,6 +46,8 @@ def commit(self, byte_data, override_blob_name=None): # name from the path builder if override_blob_name: blob_name = override_blob_name + if blob_name.startswith(self.bucket + "/"): + blob_name = blob_name[len(self.bucket) + 1 :] else: blob_name = self._build_path() diff --git a/mabel/data/internals/group_by.py b/mabel/data/internals/group_by.py index a6ac668..c93667d 100644 --- a/mabel/data/internals/group_by.py +++ b/mabel/data/internals/group_by.py @@ -73,8 +73,7 @@ def _map(self, collect_columns): for record in self._dictset: try: group_key: int = xxh3_64_intdigest( - "".join([str(record[column]) for column in self._columns]), - HASH_SEED + "".join([str(record[column]) for column in self._columns]), HASH_SEED ) except KeyError: group_key: int = xxh3_64_intdigest( diff --git a/mabel/data/writers/internals/blob_writer.py b/mabel/data/writers/internals/blob_writer.py index 2ad72a9..0871a9c 100644 --- a/mabel/data/writers/internals/blob_writer.py +++ b/mabel/data/writers/internals/blob_writer.py @@ -23,7 +23,6 @@ class BlobWriter(object): # this variable outside the __init__. buffer = bytearray() byte_count = 0 - manifest = {} def __init__( self, @@ -141,7 +140,6 @@ def commit(self): if self.records_in_buffer > 0: lock = threading.Lock() - summary = None try: lock.acquire(blocking=True, timeout=10) @@ -155,10 +153,6 @@ def commit(self): ) pytable = self.wal.arrow() - try: - summary = self.wal.profile.to_dicts() - except Exception as e: - print(f"[MABEL] Unable to profile morsel - {type(e).__name__} - {e}") # if we have a schema, make effort to align the parquet file to it if self.schema: @@ -179,7 +173,6 @@ def commit(self): committed_blob_name = self.inner_writer.commit( byte_data=write_buffer, override_blob_name=None ) - self.manifest[committed_blob_name] = summary if "BACKOUT" in committed_blob_name: get_logger().warning( diff --git a/tests/test_data_group_by.py b/tests/test_data_group_by.py index 28013ce..ff5ad0e 100644 --- a/tests/test_data_group_by.py +++ b/tests/test_data_group_by.py @@ -130,7 +130,9 @@ def test_gappy_set(): {"AVG(key)": 1.0, "value": "one"}, ] - assert set(tuple(sorted(d.items())) for d in gs) == set(tuple(sorted(d.items())) for d in expected) + assert set(tuple(sorted(d.items())) for d in gs) == set( + tuple(sorted(d.items())) for d in expected + ) if __name__ == "__main__": # pragma: no cover diff --git a/tests/test_writer_batch_writer.py b/tests/test_writer_batch_writer.py index 117487e..644b4f3 100644 --- a/tests/test_writer_batch_writer.py +++ b/tests/test_writer_batch_writer.py @@ -6,6 +6,7 @@ import pytest sys.path.insert(1, os.path.join(sys.path[0], "..")) + from mabel.adapters.disk import DiskReader, DiskWriter from mabel.data import BatchWriter from mabel.data import Reader @@ -215,7 +216,7 @@ def get_data(): if __name__ == "__main__": # pragma: no cover - from tests.helpers.runner import run_tests + from helpers.runner import run_tests test_writer_without_schema_parquet() run_tests()