
Commit ed77040

bitner, lossyrob, and mmcfarland authored
Bug fixes for 0.5.1 (#106)
* Remove Stable declaration from delete_items function
* Python dehydrate marks top-level base_keys as DNM

  Fixes and tests that dehydration is aware of top-level keys on the base item which should be marked as do-not-merge on the dehydrated item.

* Fix test class to use hydration module
* update changelog
* update tests for changes in fields extension, add simpler testing for sql queries
* update version to 0.6.0, update changelog
* add triggers to allow deleting collections and cleaning up partitions

Co-authored-by: Rob Emanuele <[email protected]>
Co-authored-by: Matt McFarland <[email protected]>
1 parent 98ac920 commit ed77040

34 files changed (+4998 −846 lines)

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -1,4 +1,19 @@
 # Changelog
+## [v0.6.0]
+
+### Fixed
+- Fix function signatures for transactional functions (delete_item etc) to make sure that they are marked as volatile
+- Fix function for getting start/end dates from a stac item
+### Changed
+- Update hydration/dehydration logic to make sure that it matches hydration/dehydration in pypgstac
+- Update fields logic in pgstac to only use full paths and to match logic in stac-fastapi
+- Always include id and collection on features regardless of fields setting
+### Added
+- Add tests to ensure that pgstac and pypgstac hydration logic is equivalent
+- Add conf item to search to allow returning results without hydrating. This allows an application using pgstac to shift the CPU load of rehydrating items from the database onto the application server.
+- Add "--dehydrated" option to loader to be able to load a dehydrated file (or iterable) of items such as would be output using pg_dump or postgresql copy.
+- Add "--chunksize" option to loader that can split the processing of an iterable or file into chunks of n records at a time
+
 ## [v0.5.1]
 
 ### Fixed
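The new search `conf` item in the changelog lets an application receive dehydrated features and do the hydration itself, moving that CPU cost off the database server. A minimal sketch of what that could look like from Python; the conf key name ("nohydrate"), the DSN, and the sample query are assumptions for illustration, not taken from this diff:

```python
from pypgstac.db import PgstacDB

db = PgstacDB(dsn="postgresql://username:password@localhost:5432/pgstac")

# Ask pgstac to skip hydration; the application would then parse the result and
# call pypgstac.hydration.hydrate(base_item, feature) on each feature itself.
results = db.search({
    "collections": ["my-collection"],
    "limit": 100,
    "conf": {"nohydrate": True},  # assumed key name for the new conf item
})
```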

pypgstac/pypgstac/db.py

Lines changed: 7 additions & 3 deletions
@@ -251,11 +251,15 @@ def func(self, function_name: str, *args: Any) -> Generator:
         """Call a database function."""
         placeholders = sql.SQL(", ").join(sql.Placeholder() * len(args))
         func = sql.Identifier(function_name)
+        cleaned_args = []
+        for arg in args:
+            if isinstance(arg, dict):
+                cleaned_args.append(psycopg.types.json.Jsonb(arg))
+            else:
+                cleaned_args.append(arg)
         base_query = sql.SQL("SELECT * FROM {}({});").format(func, placeholders)
-        return self.query(base_query, *args)
+        return self.query(base_query, cleaned_args)
 
     def search(self, query: Union[dict, str, psycopg.types.json.Jsonb] = "{}") -> str:
         """Search PgStac."""
-        if isinstance(query, dict):
-            query = psycopg.types.json.Jsonb(query)
         return dumps(next(self.func("search", query))[0])
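With this change, `func` adapts any dict argument to psycopg's `Jsonb` before running the query, so `search` (and any other caller) can pass a plain dictionary. A small usage sketch; the DSN and the sample queries are illustrative:

```python
from pypgstac.db import PgstacDB

db = PgstacDB(dsn="postgresql://username:password@localhost:5432/pgstac")

# The dict is wrapped in Jsonb inside func(), so this works for any pgstac
# function that takes a jsonb argument, not only search().
row = next(db.func("search", {"collections": ["my-collection"], "limit": 1}))
print(row[0])

# search() itself now relies on the same conversion instead of wrapping the dict.
print(db.search({"ids": ["item-1"]}))
```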

pypgstac/pypgstac/hydration.py

Lines changed: 13 additions & 5 deletions
@@ -1,3 +1,4 @@
+"""Hydrate data in pypgstac rather than on the database."""
 from copy import deepcopy
 from typing import Any, Dict
 

@@ -11,7 +12,6 @@ def hydrate(base_item: Dict[str, Any], item: Dict[str, Any]) -> Dict[str, Any]:
     This will not perform a deep copy; values of the original item will be referenced
     in the return item.
     """
-
     # Merge will mutate i, but create deep copies of values in the base item
     # This will prevent the base item values from being mutated, e.g. by
     # filtering out fields in `filter_fields`.

@@ -103,6 +103,10 @@ def strip(base_value: Dict[str, Any], item_value: Dict[str, Any]) -> Dict[str, A
             else:
                 # Unequal non-dict values are copied over from the incoming item
                 out[key] = value
+
+        # Mark any top-level keys from the base_item that are not in the incoming item
+        apply_marked_keys(base_value, item_value, out)
+
         return out
 
     return strip(base_item, full_item)

@@ -113,13 +117,17 @@ def apply_marked_keys(
     full_item: Dict[str, Any],
     dehydrated: Dict[str, Any],
 ) -> None:
-    """
+    """Mark keys.
+
     Mark any keys that are present on the base item but not in the incoming item
     as `do-not-merge` on the dehydrated item. This will prevent they key from
     being rehydrated.
 
     This modifies the dehydrated item in-place.
     """
-    marked_keys = [key for key in base_item if key not in full_item.keys()]
-    marked_dict = {k: DO_NOT_MERGE_MARKER for k in marked_keys}
-    dehydrated.update(marked_dict)
+    try:
+        marked_keys = [key for key in base_item if key not in full_item.keys()]
+        marked_dict = {k: DO_NOT_MERGE_MARKER for k in marked_keys}
+        dehydrated.update(marked_dict)
+    except TypeError:
+        pass
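A short sketch of the expected dehydration behavior this change covers; the sample items are invented, and only `dehydrate`, `hydrate`, and `DO_NOT_MERGE_MARKER` come from the module. A top-level key that exists on the base item but not on the incoming item is marked do-not-merge, so hydration should not reintroduce it:

```python
from pypgstac.hydration import DO_NOT_MERGE_MARKER, dehydrate, hydrate

base_item = {"type": "Feature", "links": [{"rel": "self", "href": "./item.json"}]}
full_item = {"type": "Feature", "id": "item-1"}  # no "links" on the incoming item

dehydrated = dehydrate(base_item, full_item)
# "links" exists only on the base item, so it is marked as do-not-merge ...
assert dehydrated["links"] == DO_NOT_MERGE_MARKER
# ... and re-hydrating is expected not to merge the base item's links back in.
assert "links" not in hydrate(base_item, dehydrated)
```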

pypgstac/pypgstac/load.py

Lines changed: 119 additions & 49 deletions
@@ -13,14 +13,13 @@
     Dict,
     Iterable,
     Iterator,
-    List,
     Optional,
     Tuple,
     Union,
     Generator,
     TextIO,
 )
-
+import csv
 import orjson
 import psycopg
 from orjson import JSONDecodeError
@@ -42,6 +41,16 @@
 logger = logging.getLogger(__name__)
 
 
+def chunked_iterable(iterable: Iterable, size: Optional[int] = 10000) -> Iterable:
+    """Chunk an iterable."""
+    it = iter(iterable)
+    while True:
+        chunk = tuple(itertools.islice(it, size))
+        if not chunk:
+            break
+        yield chunk
+
+
 class Tables(str, Enum):
     """Available tables for loading."""
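The helper added above simply yields fixed-size tuples from any iterable, for example:

```python
from pypgstac.load import chunked_iterable

# Tiny usage sketch; the values are illustrative.
for chunk in chunked_iterable(range(25), size=10):
    print(len(chunk))  # prints 10, 10, 5
```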

@@ -133,6 +142,7 @@ class Loader:
     """Utilities for loading data."""
 
     db: PgstacDB
+    _partition_cache: Optional[dict] = None
 
     @lru_cache
     def collection_json(self, collection_id: str) -> Tuple[dict, int, str]:

@@ -149,6 +159,7 @@ def collection_json(self, collection_id: str) -> Tuple[dict, int, str]:
             raise Exception(
                 f"Collection {collection_id} is not present in the database"
             )
+        logger.debug(f"Found {collection_id} with base_item {base_item}")
         return base_item, key, partition_trunc
 
     def load_collections(

@@ -270,7 +281,16 @@ def load_partition(
                 ) as copy:
                     for item in items:
                         item.pop("partition")
-                        copy.write_row(list(item.values()))
+                        copy.write_row(
+                            (
+                                item["id"],
+                                item["collection"],
+                                item["datetime"],
+                                item["end_datetime"],
+                                item["geometry"],
+                                item["content"],
+                            )
+                        )
             logger.debug(cur.statusmessage)
             logger.debug(f"Rows affected: {cur.rowcount}")
         elif insert_mode in (

@@ -295,7 +315,16 @@ def load_partition(
                 ) as copy:
                     for item in items:
                         item.pop("partition")
-                        copy.write_row(list(item.values()))
+                        copy.write_row(
+                            (
+                                item["id"],
+                                item["collection"],
+                                item["datetime"],
+                                item["end_datetime"],
+                                item["geometry"],
+                                item["content"],
+                            )
+                        )
             logger.debug(cur.statusmessage)
             logger.debug(f"Copied rows: {cur.rowcount}")
 
@@ -369,61 +398,102 @@ def load_partition(
                 f"Copying data for {partition} took {time.perf_counter() - t} seconds"
             )
 
+    def _partition_update(self, item: dict) -> str:
+
+        p = item.get("partition", None)
+        if p is None:
+            _, key, partition_trunc = self.collection_json(item["collection"])
+            if partition_trunc == "year":
+                pd = item["datetime"].replace("-", "")[:4]
+                p = f"_items_{key}_{pd}"
+            elif partition_trunc == "month":
+                pd = item["datetime"].replace("-", "")[:6]
+                p = f"_items_{key}_{pd}"
+            else:
+                p = f"_items_{key}"
+            item["partition"] = p
+
+        if self._partition_cache is None:
+            self._partition_cache = {}
+
+        partition = self._partition_cache.get(
+            item["partition"],
+            {
+                "partition": None,
+                "collection": None,
+                "mindt": None,
+                "maxdt": None,
+                "minedt": None,
+                "maxedt": None,
+            },
+        )
+
+        partition["partition"] = item["partition"]
+        partition["collection"] = item["collection"]
+        if partition["mindt"] is None or item["datetime"] < partition["mindt"]:
+            partition["mindt"] = item["datetime"]
+
+        if partition["maxdt"] is None or item["datetime"] > partition["maxdt"]:
+            partition["maxdt"] = item["datetime"]
+
+        if partition["minedt"] is None or item["end_datetime"] < partition["minedt"]:
+            partition["minedt"] = item["end_datetime"]
+
+        if partition["maxedt"] is None or item["end_datetime"] > partition["maxedt"]:
+            partition["maxedt"] = item["end_datetime"]
+        self._partition_cache[item["partition"]] = partition
+
+        return p
+
+    def read_dehydrated(self, file: Union[Path, str] = "stdin") -> Generator:
+        if file is None:
+            file = "stdin"
+        if isinstance(file, str):
+            open_file: Any = open_std(file, "r")
+            with open_file as f:
+                fields = [
+                    "id",
+                    "geometry",
+                    "collection",
+                    "datetime",
+                    "end_datetime",
+                    "content",
+                ]
+                csvreader = csv.DictReader(f, fields, delimiter="\t")
+                for item in csvreader:
+                    item["partition"] = self._partition_update(item)
+                    yield item
+
+    def read_hydrated(
+        self, file: Union[Path, str, Iterator[Any]] = "stdin"
+    ) -> Generator:
+        for line in read_json(file):
+            item = self.format_item(line)
+            item["partition"] = self._partition_update(item)
+            yield item
+
     def load_items(
         self,
         file: Union[Path, str, Iterator[Any]] = "stdin",
         insert_mode: Optional[Methods] = Methods.insert,
+        dehydrated: Optional[bool] = False,
+        chunksize: Optional[int] = 10000,
     ) -> None:
         """Load items json records."""
         if file is None:
             file = "stdin"
         t = time.perf_counter()
-        items: List = []
-        partitions: dict = {}
-        for line in read_json(file):
-            item = self.format_item(line)
-            items.append(item)
-            partition = partitions.get(
-                item["partition"],
-                {
-                    "partition": None,
-                    "collection": None,
-                    "mindt": None,
-                    "maxdt": None,
-                    "minedt": None,
-                    "maxedt": None,
-                },
-            )
-            partition["partition"] = item["partition"]
-            partition["collection"] = item["collection"]
-            if partition["mindt"] is None or item["datetime"] < partition["mindt"]:
-                partition["mindt"] = item["datetime"]
-
-            if partition["maxdt"] is None or item["datetime"] > partition["maxdt"]:
-                partition["maxdt"] = item["datetime"]
-
-            if (
-                partition["minedt"] is None
-                or item["end_datetime"] < partition["minedt"]
-            ):
-                partition["minedt"] = item["end_datetime"]
-
-            if (
-                partition["maxedt"] is None
-                or item["end_datetime"] > partition["maxedt"]
-            ):
-                partition["maxedt"] = item["end_datetime"]
-            partitions[item["partition"]] = partition
-        logger.debug(
-            f"Loading and parsing data took {time.perf_counter() - t} seconds."
-        )
-        t = time.perf_counter()
-        items.sort(key=lambda x: x["partition"])
-        logger.debug(f"Sorting data took {time.perf_counter() - t} seconds.")
-        t = time.perf_counter()
+        self._partition_cache = {}
+
+        if dehydrated and isinstance(file, str):
+            items = self.read_dehydrated(file)
+        else:
+            items = self.read_hydrated(file)
 
-        for k, g in itertools.groupby(items, lambda x: x["partition"]):
-            self.load_partition(partitions[k], g, insert_mode)
+        for chunk in chunked_iterable(items, chunksize):
+            list(chunk).sort(key=lambda x: x["partition"])
+            for k, g in itertools.groupby(chunk, lambda x: x["partition"]):
+                self.load_partition(self._partition_cache[k], g, insert_mode)
 
         logger.debug(f"Adding data to database took {time.perf_counter() - t} seconds.")
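Taken together, `load_items` now streams items in chunks instead of materializing one big list, and can read a pre-dehydrated tab-separated file whose columns match `read_dehydrated`'s field list (id, geometry, collection, datetime, end_datetime, content), such as the output of a PostgreSQL COPY of the items table. A hedged usage sketch; the file names, DSN, and Loader construction are assumptions, and the "--dehydrated"/"--chunksize" CLI flags from the changelog are assumed to map onto these keyword arguments:

```python
from pypgstac.db import PgstacDB
from pypgstac.load import Loader, Methods

db = PgstacDB(dsn="postgresql://username:password@localhost:5432/pgstac")
loader = Loader(db=db)  # assumes Loader is constructed with the PgstacDB wrapper

# Hydrated newline-delimited JSON items, processed 5000 at a time; each chunk is
# grouped by partition (e.g. _items_<key>_<YYYYMM> for a collection with month
# partitioning) before being handed to load_partition.
loader.load_items("items.ndjson", insert_mode=Methods.insert, chunksize=5000)

# A dehydrated TSV export with the six columns listed above.
loader.load_items("items.tsv", insert_mode=Methods.insert, dehydrated=True)
```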