googleapis · tswast · Jun 24, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
@@ -1,2 +1,6 @@
-{"id": 1,   "person": {"name": "Alice", "age":30,   "address": {"city": "New York", "country": "USA"}}}
-{"id": 2,   "person": {"name": "Bob",   "age":25,   "address": {"city": "London",   "country": "UK"}}}
+{"id": 1, "person": {"name": "Alice", "age": 30, "address": {"city": "New York", "country": "USA"}}, "bool_col": true, "int64_col": "123456789", "float64_col": 1.25, "string_col": "Hello World", "json_col": {"a": 1, "b": [1, 2]}, "date_col": "2026-06-24", "time_col": "12:34:56.789012", "datetime_col": "2026-06-24 12:34:56.789012", "timestamp_col": "2026-06-24T12:34:56.789012Z", "bytes_col": "SGVsbG8=", "numeric_col": "123456.789", "bignumeric_col": "123456.7890123456789", "geography_col": "POINT(30 10)", "duration_col": "1000"}
+{"id": 2, "person": {"name": "", "age": -1, "address": {"city": "", "country": ""}}, "bool_col": false, "int64_col": "-9223372036854775808", "float64_col": "-Infinity", "string_col": "", "json_col": {}, "date_col": "0001-01-01", "time_col": "00:00:00", "datetime_col": "0001-01-02 00:00:00", "timestamp_col": "0001-01-02T00:00:00Z", "bytes_col": "", "numeric_col": "-99999999999999999999999999999.999999999", "bignumeric_col": "-99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POINT(0 0)", "duration_col": "-9223372036854775"}
+{"id": 3, "person": {"name": "Very Long Name...", "age": 150, "address": {"city": "City", "country": "Country"}}, "bool_col": true, "int64_col": "9223372036854775807", "float64_col": "Infinity", "string_col": "Unicode: 🚀 Spark ✨", "json_col": {"max": true, "nested": {"val": 999}}, "date_col": "9999-12-31", "time_col": "23:59:59.999999", "datetime_col": "9999-12-31 23:59:59.999999", "timestamp_col": "9999-12-31T23:59:59.999999Z", "bytes_col": "dmVyeSBsb25nIGJ5dGVzIHZhbHVl", "numeric_col": "99999999999999999999999999999.999999999", "bignumeric_col": "99999999999999999999999999999999999999.99999999999999999999999999999999999999", "geography_col": "POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))", "duration_col": "9223372036854775"}
+{"id": 4, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
+{"id": 5, "person": {"name": "Bob", "age": 0, "address": null}, "bool_col": false, "int64_col": "0", "float64_col": "NaN", "string_col": "Line 1\nLine 2\n\"Quotes\"", "json_col": [1, "two", null], "date_col": "1970-01-01", "time_col": "12:00:00", "datetime_col": "1970-01-01 12:00:00", "timestamp_col": "1970-01-01T12:00:00Z", "bytes_col": "AA==", "numeric_col": "0", "bignumeric_col": "0", "geography_col": "LINESTRING(0 0, 1 1, 2 2)", "duration_col": "0"}
+{"id": 6, "person": null, "bool_col": null, "int64_col": null, "float64_col": null, "string_col": null, "json_col": null, "date_col": null, "time_col": null, "datetime_col": null, "timestamp_col": null, "bytes_col": null, "numeric_col": null, "bignumeric_col": null, "geography_col": null, "duration_col": null}
@@ -7,6 +7,7 @@
     {
         "name": "person",
         "type": "RECORD",
+        "mode": "NULLABLE",
         "fields": [
             {
                 "name": "name",
@@ -21,6 +22,7 @@
             {
                 "name": "address",
                 "type": "RECORD",
+                "mode": "NULLABLE",
                 "fields": [
                     {
                         "name": "city",
@@ -35,5 +37,76 @@
                 ]
             }
         ]
+    },
+    {
+        "name": "bool_col",
+        "type": "BOOLEAN",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "int64_col",
+        "type": "INTEGER",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "float64_col",
+        "type": "FLOAT",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "string_col",
+        "type": "STRING",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "json_col",
+        "type": "JSON",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "date_col",
+        "type": "DATE",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "time_col",
+        "type": "TIME",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "datetime_col",
+        "type": "DATETIME",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "timestamp_col",
+        "type": "TIMESTAMP",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "bytes_col",
+        "type": "BYTES",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "numeric_col",
+        "type": "NUMERIC",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "bignumeric_col",
+        "type": "BIGNUMERIC",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "geography_col",
+        "type": "GEOGRAPHY",
+        "mode": "NULLABLE"
+    },
+    {
+        "name": "duration_col",
+        "type": "INTEGER",
+        "mode": "NULLABLE",
+        "description": "#microseconds"
     }
 ]
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
 import hashlib
 import logging
 import math
 import pathlib
 import textwrap
 import traceback
 import typing
-from datetime import datetime
 from typing import Dict, Generator, Optional
 
 import fsspec  # type: ignore[import-untyped]
@@ -34,6 +34,7 @@
 import google.cloud.storage as storage  # type: ignore
 import numpy as np
 import pandas as pd
+import pandas.arrays
 import pyarrow as pa
 import pytest
 import pytz
@@ -496,14 +497,220 @@ def nested_structs_df(
 
 @pytest.fixture(scope="session")
 def nested_structs_pandas_df(nested_structs_pandas_type: pd.ArrowDtype) -> pd.DataFrame:
-    """pd.DataFrame pointing at test data."""
+    """pd.DataFrame pointing at test data.
+
+    Manually parses using json.loads to preserve data types.
+    """
+    import base64
+    import decimal
+    import json
+
+    import db_dtypes  # type: ignore[import-untyped]
+    import geopandas as gpd  # type: ignore[import-untyped]
+
+    with open(DATA_DIR / "nested_structs.jsonl") as f:
+        raw_rows = [json.loads(line) for line in f]
+
+    ids = [row["id"] for row in raw_rows]
+
+    def get_val(row, col_name):
+        return row.get(col_name)
+
+    # person
+    person_struct_schema = nested_structs_pandas_type.pyarrow_dtype
+    processed_person: list[Optional[dict[str, typing.Any]]] = []
+    for row in raw_rows:
+        x = get_val(row, "person")
+        if x is None:
+            processed_person.append(None)
+        else:
+            d = dict(x)
+            if "age" in d and d["age"] is not None:
+                d["age"] = int(d["age"])
+            processed_person.append(d)
+    person_arr = pa.array(processed_person, type=person_struct_schema)
+    person_ser = pd.Series(person_arr, index=ids, dtype=nested_structs_pandas_type)
+
+    # bool_col
+    bool_vals = [
+        bool(get_val(row, "bool_col")) if get_val(row, "bool_col") is not None else None
+        for row in raw_rows
+    ]
+    bool_ser = pd.Series(bool_vals, index=ids, dtype=pd.BooleanDtype())
+
+    # int64_col
+    int64_vals = [
+        int(get_val(row, "int64_col"))
+        if get_val(row, "int64_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    int64_ser = pd.Series(int64_vals, index=ids, dtype=pd.Int64Dtype())
+
+    # float64_col
+    float64_vals = [
+        float(get_val(row, "float64_col"))
+        if get_val(row, "float64_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    np_vals = np.array(
+        [x if x is not None else np.nan for x in float64_vals], dtype=np.float64
+    )
+    mask = np.array([x is None for x in float64_vals], dtype=bool)
+    float64_arr = pd.arrays.FloatingArray(np_vals, mask)  # type: ignore
+    float64_ser = pd.Series(float64_arr, index=ids)
+
+    # string_col
+    string_vals = [
+        str(get_val(row, "string_col"))
+        if get_val(row, "string_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    string_ser = pd.Series(
+        string_vals, index=ids, dtype=pd.StringDtype(storage="pyarrow")
+    )
 
-    df = pd.read_json(
-        DATA_DIR / "nested_structs.jsonl",
-        lines=True,
+    # json_col
+    json_strs: list[Optional[str]] = []
+    for row in raw_rows:
+        if "json_col" not in row:
+            json_strs.append(None)
+        elif row["json_col"] is None:
+            json_strs.append("null")
+        else:
+            json_strs.append(
+                json.dumps(row["json_col"], sort_keys=True, separators=(",", ":"))
+            )
+    json_arr = pa.array(json_strs, type=db_dtypes.JSONArrowType())
+    json_ser = pd.Series(
+        json_arr, index=ids, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType())
+    )
+
+    # date_col
+    date_vals = [
+        datetime.date.fromisoformat(get_val(row, "date_col"))
+        if get_val(row, "date_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    date_arr = pa.array(date_vals, type=pa.date32())
+    date_ser = pd.Series(date_arr, index=ids, dtype=pd.ArrowDtype(pa.date32()))
+
+    # time_col
+    time_vals = [
+        datetime.time.fromisoformat(get_val(row, "time_col"))
+        if get_val(row, "time_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    time_arr = pa.array(time_vals, type=pa.time64("us"))
+    time_ser = pd.Series(time_arr, index=ids, dtype=pd.ArrowDtype(pa.time64("us")))
+
+    # datetime_col
+    datetime_vals: list[Optional[datetime.datetime]] = []
+    for row in raw_rows:
+        val = get_val(row, "datetime_col")
+        if val is None:
+            datetime_vals.append(None)
+        else:
+            datetime_vals.append(datetime.datetime.fromisoformat(val.replace(" ", "T")))
+    datetime_arr = pa.array(datetime_vals, type=pa.timestamp("us"))
+    datetime_ser = pd.Series(
+        datetime_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us"))
     )
-    df = df.set_index("id")
-    df["person"] = df["person"].astype(nested_structs_pandas_type)
+
+    # timestamp_col
+    timestamp_vals = [
+        datetime.datetime.fromisoformat(
+            get_val(row, "timestamp_col").replace("Z", "+00:00")
+        )
+        if get_val(row, "timestamp_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    timestamp_arr = pa.array(timestamp_vals, type=pa.timestamp("us", tz="UTC"))
+    timestamp_ser = pd.Series(
+        timestamp_arr, index=ids, dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
+    )
+
+    # bytes_col
+    bytes_vals: list[Optional[bytes]] = []
+    for row in raw_rows:
+        val = get_val(row, "bytes_col")
+        if val is None:
+            bytes_vals.append(None)
+        elif val == "":
+            bytes_vals.append(b"")
+        else:
+            bytes_vals.append(base64.b64decode(val))
+    bytes_arr = pa.array(bytes_vals, type=pa.binary())
+    bytes_ser = pd.Series(bytes_arr, index=ids, dtype=pd.ArrowDtype(pa.binary()))
+
+    # numeric_col
+    numeric_vals = [
+        decimal.Decimal(str(get_val(row, "numeric_col")))
+        if get_val(row, "numeric_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    numeric_arr = pa.array(numeric_vals, type=pa.decimal128(38, 9))
+    numeric_ser = pd.Series(
+        numeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal128(38, 9))
+    )
+
+    # bignumeric_col
+    bignumeric_vals = [
+        decimal.Decimal(str(get_val(row, "bignumeric_col")))
+        if get_val(row, "bignumeric_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    bignumeric_arr = pa.array(bignumeric_vals, type=pa.decimal256(76, 38))
+    bignumeric_ser = pd.Series(
+        bignumeric_arr, index=ids, dtype=pd.ArrowDtype(pa.decimal256(76, 38))
+    )
+
+    # geography_col
+    geo_vals = [get_val(row, "geography_col") for row in raw_rows]
+    geo_ser = gpd.GeoSeries.from_wkt(geo_vals)
+    geo_ser.index = ids
+
+    # duration_col
+    duration_vals = [
+        int(get_val(row, "duration_col"))
+        if get_val(row, "duration_col") is not None
+        else None
+        for row in raw_rows
+    ]
+    duration_arr = pa.array(duration_vals, type=pa.duration("us"))
+    duration_ser = pd.Series(
+        duration_arr, index=ids, dtype=pd.ArrowDtype(pa.duration("us"))
+    )
+
+    df = pd.DataFrame(
+        {
+            "person": person_ser,
+            "bool_col": bool_ser,
+            "int64_col": int64_ser,
+            "float64_col": float64_ser,
+            "string_col": string_ser,
+            "json_col": json_ser,
+            "date_col": date_ser,
+            "time_col": time_ser,
+            "datetime_col": datetime_ser,
+            "timestamp_col": timestamp_ser,
+            "bytes_col": bytes_ser,
+            "numeric_col": numeric_ser,
+            "bignumeric_col": bignumeric_ser,
+            "geography_col": geo_ser,
+            "duration_col": duration_ser,
+        },
+        index=ids,
+    )
+    df.index.name = "id"
+
     return df
 
 
@@ -834,9 +1041,9 @@ def new_time_series_pandas_df():
     return pd.DataFrame(
         {
             "parsed_date": [
-                datetime(2017, 8, 2, tzinfo=utc),
-                datetime(2017, 8, 3, tzinfo=utc),
-                datetime(2017, 8, 4, tzinfo=utc),
+                datetime.datetime(2017, 8, 2, tzinfo=utc),
+                datetime.datetime(2017, 8, 3, tzinfo=utc),
+                datetime.datetime(2017, 8, 4, tzinfo=utc),
             ],
             "total_visits": [2500, 2500, 2500],
         }
@@ -855,12 +1062,12 @@ def new_time_series_pandas_df_w_id():
     return pd.DataFrame(
         {
             "parsed_date": [
-                datetime(2017, 8, 2, tzinfo=utc),
-                datetime(2017, 8, 2, tzinfo=utc),
-                datetime(2017, 8, 3, tzinfo=utc),
-                datetime(2017, 8, 3, tzinfo=utc),
-                datetime(2017, 8, 4, tzinfo=utc),
-                datetime(2017, 8, 4, tzinfo=utc),
+                datetime.datetime(2017, 8, 2, tzinfo=utc),
+                datetime.datetime(2017, 8, 2, tzinfo=utc),
+                datetime.datetime(2017, 8, 3, tzinfo=utc),
+                datetime.datetime(2017, 8, 3, tzinfo=utc),
+                datetime.datetime(2017, 8, 4, tzinfo=utc),
+                datetime.datetime(2017, 8, 4, tzinfo=utc),
             ],
             "id": ["1", "2", "1", "2", "1", "2"],
             "total_visits": [2500, 2500, 2500, 2500, 2500, 2500],
@@ -1473,7 +1680,7 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent
                 continue
 
             # Ignore the functions less than one day old
-            age = datetime.now() - datetime.fromtimestamp(
+            age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
                 cloud_function.update_time.timestamp()
             )
             if age.days <= 0: