Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions packages/bigframes/tests/data/nested_structs.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}}
{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}}
{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"}
{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"}
{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"}
{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"}
{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
73 changes: 73 additions & 0 deletions packages/bigframes/tests/data/nested_structs_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
{
"name": "person",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "name",
Expand All @@ -21,6 +22,7 @@
{
"name": "address",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "city",
Expand All @@ -35,5 +37,76 @@
]
}
]
},
{
"name": "bool_col",
"type": "BOOLEAN",
"mode": "NULLABLE"
},
{
"name": "int64_col",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "float64_col",
"type": "FLOAT",
"mode": "NULLABLE"
},
{
"name": "string_col",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "json_col",
"type": "JSON",
"mode": "NULLABLE"
},
{
"name": "date_col",
"type": "DATE",
"mode": "NULLABLE"
},
{
"name": "time_col",
"type": "TIME",
"mode": "NULLABLE"
},
{
"name": "datetime_col",
"type": "DATETIME",
"mode": "NULLABLE"
},
{
"name": "timestamp_col",
"type": "TIMESTAMP",
"mode": "NULLABLE"
},
{
"name": "bytes_col",
"type": "BYTES",
"mode": "NULLABLE"
},
{
"name": "numeric_col",
"type": "NUMERIC",
"mode": "NULLABLE"
},
{
"name": "bignumeric_col",
"type": "BIGNUMERIC",
"mode": "NULLABLE"
},
{
"name": "geography_col",
"type": "GEOGRAPHY",
"mode": "NULLABLE"
},
{
"name": "duration_col",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "#microseconds"
}
]
241 changes: 224 additions & 17 deletions packages/bigframes/tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import hashlib
import logging
import math
import pathlib
import textwrap
import traceback
import typing
from datetime import datetime
from typing import Dict, Generator, Optional

import fsspec # type: ignore[import-untyped]
Expand All @@ -34,6 +34,7 @@
import google.cloud.storage as storage # type: ignore
import numpy as np
import pandas as pd
import pandas.arrays
import pyarrow as pa
import pytest
import pytz
Expand Down Expand Up @@ -496,14 +497,220 @@ def nested_structs_df(

@pytest.fixture(scope="session")
def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame:
"""pd.DataFrame pointing at test data."""
"""pd.DataFrame pointing at test data.

Manually parses using json.loads to preserve data types.
"""
import base64
import decimal
import json

import db_dtypes # type: ignore[import-untyped]
import geopandas as gpd # type: ignore[import-untyped]

with open(DATA_DIR / "nested_structs.jsonl") as f:
raw_rows = [json.loads(line) for line in f]

ids = [row["id"] for row in raw_rows]

def get_val(row, col_name):
return row.get(col_name)

# person
person_struct_schema = nested_structs_pandas_type.pyarrow_dtype
processed_person: list[Optional[dict[str, typing.Any]]] = []
for row in raw_rows:
x = get_val(row, "person")
if x is None:
processed_person.append(None)
else:
d = dict(x)
if "age" in d and d["age"] is not None:
d["age"] = int(d["age"])
processed_person.append(d)
person_arr = pa.array(processed_person, type=person_struct_schema)
person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type)

# bool_col
bool_vals = [
bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None
for row in raw_rows
]
bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype())

# int64_col
int64_vals = [
int(get_val(row, "int64_col"))
if get_val(row, "int64_col") is not None
else None
for row in raw_rows
]
int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype())

# float64_col
float64_vals = [
float(get_val(row, "float64_col"))
if get_val(row, "float64_col") is not None
else None
for row in raw_rows
]
np_vals = np.array(
[x if x is not None else np.nan for x in float64_vals], dtype=np.float64
)
mask = np.array([x is None for x in float64_vals], dtype=bool)
float64_arr = pd.arrays.FloatingArray(np_vals, mask) # type: ignore
float64_ser = pd.Series(float64_arr, index=ids)

# string_col
string_vals = [
str(get_val(row, "string_col"))
if get_val(row, "string_col") is not None
else None
for row in raw_rows
]
string_ser = pd.Series(
string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow")
)

df = pd.read_json(
DATA_DIR / "nested_structs.jsonl",
lines=True,
# json_col
json_strs: list[Optional[str]] = []
for row in raw_rows:
if "json_col" not in row:
json_strs.append(None)
elif row["json_col"] is None:
json_strs.append("null")
else:
json_strs.append(
json.dumps(row["json_col"], sort_keys=True, separators=(",", ":"))
)
json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType())
json_ser = pd.Series(
json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType())
)

# date_col
date_vals = [
datetime.date.fromisoformat(get_val(row, "date_col"))
if get_val(row, "date_col") is not None
else None
for row in raw_rows
]
date_arr = pa.array(date_vals, type=pa.date32())
date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32()))

# time_col
time_vals = [
datetime.time.fromisoformat(get_val(row, "time_col"))
if get_val(row, "time_col") is not None
else None
for row in raw_rows
]
time_arr = pa.array(time_vals, type=pa.time64("us"))
time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us")))

# datetime_col
datetime_vals: list[Optional[datetime.datetime]] = []
for row in raw_rows:
val = get_val(row, "datetime_col")
if val is None:
datetime_vals.append(None)
else:
datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T")))
datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us"))
datetime_ser = pd.Series(
datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us"))
)
df = df.set_index("id")
df["person"] = df["person"].astype(nested_structs_pandas_type)

# timestamp_col
timestamp_vals = [
datetime.datetime.fromisoformat(
get_val(row, "timestamp_col").replace("Z", "+00:00")
)
if get_val(row, "timestamp_col") is not None
else None
for row in raw_rows
]
Comment thread
tswast marked this conversation as resolved.
timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC"))
timestamp_ser = pd.Series(
timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
)

# bytes_col
bytes_vals: list[Optional[bytes]] = []
for row in raw_rows:
val = get_val(row, "bytes_col")
if val is None:
bytes_vals.append(None)
elif val == "":
bytes_vals.append(b"")
else:
bytes_vals.append(base64.b64decode(val))
bytes_arr = pa.array(bytes_vals, type=pa.binary())
bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary()))

# numeric_col
numeric_vals = [
decimal.Decimal(str(get_val(row, "numeric_col")))
if get_val(row, "numeric_col") is not None
else None
for row in raw_rows
]
numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9))
numeric_ser = pd.Series(
numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9))
)

# bignumeric_col
bignumeric_vals = [
decimal.Decimal(str(get_val(row, "bignumeric_col")))
if get_val(row, "bignumeric_col") is not None
else None
for row in raw_rows
]
bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38))
bignumeric_ser = pd.Series(
bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38))
)

# geography_col
geo_vals = [get_val(row, "geography_col") for row in raw_rows]
geo_ser = gpd.GeoSeries.from_wkt(geo_vals)
geo_ser.index = ids

# duration_col
duration_vals = [
int(get_val(row, "duration_col"))
if get_val(row, "duration_col") is not None
else None
for row in raw_rows
]
duration_arr = pa.array(duration_vals, type=pa.duration("us"))
duration_ser = pd.Series(
duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us"))
)

df = pd.DataFrame(
{
"person": person_ser,
"bool_col": bool_ser,
"int64_col": int64_ser,
"float64_col": float64_ser,
"string_col": string_ser,
"json_col": json_ser,
"date_col": date_ser,
"time_col": time_ser,
"datetime_col": datetime_ser,
"timestamp_col": timestamp_ser,
"bytes_col": bytes_ser,
"numeric_col": numeric_ser,
"bignumeric_col": bignumeric_ser,
"geography_col": geo_ser,
"duration_col": duration_ser,
},
index=ids,
)
df.index.name = "id"

return df


Expand Down Expand Up @@ -834,9 +1041,9 @@ def new_time_series_pandas_df():
return pd.DataFrame(
{
"parsed_date": [
datetime(2017, 8, 2, tzinfo=utc),
datetime(2017, 8, 3, tzinfo=utc),
datetime(2017, 8, 4, tzinfo=utc),
datetime.datetime(2017, 8, 2, tzinfo=utc),
datetime.datetime(2017, 8, 3, tzinfo=utc),
datetime.datetime(2017, 8, 4, tzinfo=utc),
],
"total_visits": [2500, 2500, 2500],
}
Expand All @@ -855,12 +1062,12 @@ def new_time_series_pandas_df_w_id():
return pd.DataFrame(
{
"parsed_date": [
datetime(2017, 8, 2, tzinfo=utc),
datetime(2017, 8, 2, tzinfo=utc),
datetime(2017, 8, 3, tzinfo=utc),
datetime(2017, 8, 3, tzinfo=utc),
datetime(2017, 8, 4, tzinfo=utc),
datetime(2017, 8, 4, tzinfo=utc),
datetime.datetime(2017, 8, 2, tzinfo=utc),
datetime.datetime(2017, 8, 2, tzinfo=utc),
datetime.datetime(2017, 8, 3, tzinfo=utc),
datetime.datetime(2017, 8, 3, tzinfo=utc),
datetime.datetime(2017, 8, 4, tzinfo=utc),
datetime.datetime(2017, 8, 4, tzinfo=utc),
],
"id": ["1", "2", "1", "2", "1", "2"],
"total_visits": [2500, 2500, 2500, 2500, 2500, 2500],
Expand Down Expand Up @@ -1473,7 +1680,7 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent
continue

# Ignore the functions less than one day old
age = datetime.now() - datetime.fromtimestamp(
age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
cloud_function.update_time.timestamp()
)
if age.days <= 0:
Expand Down
Loading
Loading