Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ repos:
hooks:
- id: sync-with-uv
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.14.11
rev: v0.14.14
hooks:
- id: ruff-check
args: [--fix, --exit-non-zero-on-fix]
Expand Down
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.47.0] - 2026-01-28

### Added

- `tilebox-datasets` and `tilebox-workflows`: Added support for pandas v3.

### Changed

- `tilebox-datasets`: The `create_dataset` method of the `Client` has been removed. Use `create_or_update_dataset` instead.
Expand Down Expand Up @@ -304,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Released under the [MIT](https://opensource.org/license/mit) license.
- Released packages: `tilebox-datasets`, `tilebox-workflows`, `tilebox-storage`, `tilebox-grpc`

[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.46.0...HEAD
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.47.0...HEAD
[0.47.0]: https://github.com/tilebox/tilebox-python/compare/v0.46.0...v0.47.0
[0.46.0]: https://github.com/tilebox/tilebox-python/compare/v0.45.0...v0.46.0
[0.45.0]: https://github.com/tilebox/tilebox-python/compare/v0.44.0...v0.45.0
[0.44.0]: https://github.com/tilebox/tilebox-python/compare/v0.43.0...v0.44.0
Expand Down
86 changes: 86 additions & 0 deletions matrix.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Matrix test configuration for testing pandas compatibility across Python versions
# Run with: pymatrix --config matrix.toml
#
# Split into scenarios per package due to pytest conftest collision when running
# multiple packages together (each has tests/conftest.py).

[[scenarios]]
name = "datasets-pandas2"
python = ["3.10", "3.11", "3.12", "3.13"]
working-dir = "tilebox-datasets"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["2.2.3"]

[[scenarios]]
name = "datasets-pandas3"
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
working-dir = "tilebox-datasets"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["3.0.0"]

[[scenarios]]
name = "storage-pandas2"
python = ["3.10", "3.11", "3.12", "3.13"]
working-dir = "tilebox-storage"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["2.2.3"]

[[scenarios]]
name = "storage-pandas3"
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
working-dir = "tilebox-storage"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["3.0.0"]

[[scenarios]]
name = "grpc-pandas2"
python = ["3.10", "3.11", "3.12", "3.13"]
working-dir = "tilebox-grpc"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["2.2.3"]

[[scenarios]]
name = "grpc-pandas3"
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
working-dir = "tilebox-grpc"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["3.0.0"]

[[scenarios]]
name = "workflows-pandas2"
python = ["3.10", "3.11", "3.12", "3.13"]
working-dir = "tilebox-workflows"
test-command = "pytest"
# Ignore FutureWarning: google-cloud-storage raises deprecation warning on Python 3.10
test-args = ["-v", "-W", "ignore::FutureWarning"]

[scenarios.packages]
pandas = ["2.2.3"]

[[scenarios]]
name = "workflows-pandas3"
python = ["3.11", "3.12", "3.13"] # pandas 3.0 requires Python 3.11+
working-dir = "tilebox-workflows"
test-command = "pytest"
test-args = ["-v"]

[scenarios.packages]
pandas = ["3.0.0"]
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ dev = [
# DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0)
"pyarrow>=17.0.0",
# some dev tooling
"ruff>=0.11.10",
"ruff>=0.14.10",
"types-protobuf>=6.30",
"junitparser>=3.2.0",
"ty>=0.0.11",
"ty>=0.0.14",
"prek>=0.2.27",
]

Expand Down
1 change: 0 additions & 1 deletion tilebox-datasets/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ dev = [
"pytest>=8.3.2",
]


[project.urls]
Homepage = "https://tilebox.com"
Documentation = "https://docs.tilebox.com/datasets/introduction"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from uuid import UUID

import pandas as pd
import pytest
from hypothesis import given, settings
from hypothesis.strategies import lists
Expand Down Expand Up @@ -152,21 +153,21 @@ def test_convert_datapoints(datapoints: list[ExampleDatapoint]) -> None: # noqa
for uuid in dataset.some_id.to_numpy():
assert isinstance(uuid, str)

# strings should be stored as object arrays, with None as the fill value if missing
# strings should be stored as object arrays, with missing values (None or NaN) as fill
if "some_string" in dataset:
for string in dataset.some_string.to_numpy():
assert string is None or isinstance(string, str)
assert pd.isna(string) or isinstance(string, str)
if "some_repeated_string" in dataset:
for string in dataset.some_repeated_string.to_numpy().ravel():
assert string is None or isinstance(string, str)
assert pd.isna(string) or isinstance(string, str)

# bytes should be stored as object arrays, with None as the fill value if missing
# bytes should be stored as object arrays, with missing values (None or NaN) as fill
if "some_bytes" in dataset:
for bytes_ in dataset.some_bytes.to_numpy():
assert bytes_ is None or isinstance(bytes_, bytes)
assert pd.isna(bytes_) or isinstance(bytes_, bytes)
if "some_repeated_bytes" in dataset:
for bytes_ in dataset.some_repeated_bytes.to_numpy().ravel():
assert bytes_ is None or isinstance(bytes_, bytes)
assert pd.isna(bytes_) or isinstance(bytes_, bytes)


@given(lists(example_datapoints(missing_fields=True), min_size=1, max_size=10))
Expand Down
7 changes: 6 additions & 1 deletion tilebox-datasets/tilebox/datasets/progress.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
from types import TracebackType
from typing import Any

try:
from typing import Self # ty: ignore[unresolved-import]
except ImportError: # Self is only available in Python 3.11+
from typing_extensions import Self

from tqdm.auto import tqdm

from tilebox.datasets.query.time_interval import TimeInterval
Expand Down Expand Up @@ -42,7 +47,7 @@ def __init__(
self._actual_start_time = actual_start_time
self._total_data_points = 0

def __enter__(self) -> "TimeIntervalProgressBar":
def __enter__(self) -> Self:
self._progress_bar = tqdm(
bar_format="{l_bar}{bar}[{elapsed}<{remaining}{postfix}]",
total=self._calc_progress_seconds(self._interval.end),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from uuid import UUID

import numpy as np
import pandas as pd
from google.protobuf.descriptor import FieldDescriptor
from google.protobuf.duration_pb2 import Duration
from google.protobuf.message import Message
Expand All @@ -17,6 +18,8 @@
from tilebox.datasets.datasets.v1.well_known_types_pb2 import Geometry, LatLon, LatLonAlt, Quaternion, Vec3

ScalarProtoFieldValue = Message | float | str | bool | bytes


ProtoFieldValue = ScalarProtoFieldValue | Sequence[ScalarProtoFieldValue] | None

_FILL_VALUES_BY_DTYPE: dict[type[np.dtype[Any]], Any] = {
Expand Down Expand Up @@ -107,7 +110,7 @@ def from_proto(self, value: ProtoFieldValue) -> int:
return value.seconds * 10**9 + value.nanos

def to_proto(self, value: DatetimeScalar) -> Timestamp | None:
if value is None or (isinstance(value, np.datetime64) and np.isnat(value)):
if is_missing(value) or (isinstance(value, np.datetime64) and np.isnat(value)):
return None
# we use pandas to_datetime function to handle a variety of input types that can be coerced to datetimes
seconds, nanos = divmod(to_datetime(value, utc=True).value, 10**9)
Expand All @@ -124,10 +127,10 @@ def from_proto(self, value: ProtoFieldValue) -> int:
return value.seconds * 10**9 + value.nanos

def to_proto(self, value: str | float | timedelta | np.timedelta64) -> Duration | None:
if value is None or (isinstance(value, np.timedelta64) and np.isnat(value)):
if is_missing(value) or (isinstance(value, np.timedelta64) and np.isnat(value)):
return None
# we use pandas to_timedelta function to handle a variety of input types that can be coerced to timedeltas
seconds, nanos = divmod(to_timedelta(value).value, 10**9) # type: ignore[arg-type]
seconds, nanos = divmod(to_timedelta(value).value, 10**9)
return Duration(seconds=seconds, nanos=nanos)


Expand All @@ -141,7 +144,7 @@ def from_proto(self, value: ProtoFieldValue) -> str:
return str(UUID(bytes=value.uuid))

def to_proto(self, value: str | UUID) -> UUIDMessage | None:
if not value: # None or empty string
if is_missing(value) or value == "": # missing or empty string
return None

if isinstance(value, str):
Expand All @@ -160,7 +163,7 @@ def from_proto(self, value: ProtoFieldValue) -> Any:
return from_wkb(value.wkb)

def to_proto(self, value: Any) -> Geometry | None:
if value is None:
if is_missing(value):
return None
return Geometry(wkb=value.wkb)

Expand All @@ -175,7 +178,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float]:
return value.x, value.y, value.z

def to_proto(self, value: tuple[float, float, float]) -> Vec3 | None:
if value is None or np.all(np.isnan(value)):
if is_missing(value) or np.all(np.isnan(value)):
return None
return Vec3(x=value[0], y=value[1], z=value[2])

Expand All @@ -190,7 +193,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float, float
return value.q1, value.q2, value.q3, value.q4

def to_proto(self, value: tuple[float, float, float, float]) -> Quaternion | None:
if value is None or np.all(np.isnan(value)):
if is_missing(value) or np.all(np.isnan(value)):
return None
return Quaternion(q1=value[0], q2=value[1], q3=value[2], q4=value[3])

Expand All @@ -205,7 +208,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float]:
return value.latitude, value.longitude

def to_proto(self, value: tuple[float, float]) -> LatLon | None:
if value is None or np.all(np.isnan(value)):
if is_missing(value) or np.all(np.isnan(value)):
return None
return LatLon(latitude=value[0], longitude=value[1])

Expand All @@ -221,7 +224,7 @@ def from_proto(self, value: ProtoFieldValue) -> tuple[float, float, float]:
return value.latitude, value.longitude, value.altitude

def to_proto(self, value: tuple[float, float, float]) -> LatLonAlt | None:
if value is None or np.all(np.isnan(value)):
if is_missing(value) or np.all(np.isnan(value)):
return None
return LatLonAlt(latitude=value[0], longitude=value[1], altitude=value[2])

Expand Down Expand Up @@ -301,3 +304,19 @@ def _camel_to_uppercase(name: str) -> str:
'PROCESSING_LEVEL'
"""
return "".join(["_" + c.lower() if c.isupper() else c for c in name]).lstrip("_").upper()


def is_missing(value: Any) -> bool:
    """Return True when *value* is a missing/null sentinel.

    Recognizes None, np.nan, pd.NA, NaT, and the other pandas missing value
    sentinels. Needed for pandas 3.0+ compatibility, where object-dtype
    columns fill gaps with np.nan rather than None.
    """
    try:
        missing = pd.isna(value)
        return bool(missing)
    except ValueError:
        # For array-like input pd.isna yields an array of bools, and bool() on
        # such an array raises ValueError. An array - even one consisting
        # entirely of NaNs - is not itself a missing value, so report False.
        return False
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ProtobufFieldType,
ProtoFieldValue,
infer_field_type,
is_missing,
)

IngestionData = Mapping[str, Collection[Any]] | Iterable[tuple[str, Collection[Any]]] | pd.DataFrame | xr.Dataset
def convert_values_to_proto(
    values: np.ndarray | pd.Series, field_type: ProtobufFieldType, filter_none: bool = False
) -> list[ProtoFieldValue]:
    """Convert a sequence of values into their protobuf representations.

    When filter_none is set, missing entries (as determined by is_missing) are
    dropped entirely rather than converted; otherwise every value is converted.
    """
    if not filter_none:
        return [field_type.to_proto(value) for value in values]
    return [field_type.to_proto(value) for value in values if not is_missing(value)]


Expand Down
20 changes: 7 additions & 13 deletions tilebox-datasets/tilebox/datasets/query/id_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,16 @@ def parse(cls, arg: IDIntervalLike, start_exclusive: bool = False, end_inclusive
Returns:
IDInterval: The parsed ID interval
"""
if isinstance(arg, IDInterval):
return arg

match arg:
case IDInterval(_, _, _, _):
return arg
case (UUID(), UUID()):
start: UUID = arg[0]
end: UUID = arg[1]
if isinstance(arg, tuple) and len(arg) == 2:
start, end = arg
if isinstance(start, UUID) and isinstance(end, UUID):
return IDInterval(
start_id=start,
end_id=end,
start_exclusive=start_exclusive,
end_inclusive=end_inclusive,
start_id=start, end_id=end, start_exclusive=start_exclusive, end_inclusive=end_inclusive
)
case (str(), str()):
start: str = arg[0]
end: str = arg[1]
if isinstance(start, str) and isinstance(end, str):
return IDInterval(
start_id=UUID(start),
end_id=UUID(end),
Expand Down
4 changes: 3 additions & 1 deletion tilebox-datasets/tilebox/datasets/query/time_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@

# A type alias for the different types that can be used to specify a time interval
TimeIntervalLike: TypeAlias = (
DatetimeScalar | tuple[DatetimeScalar, DatetimeScalar] | xr.DataArray | xr.Dataset | "TimeInterval"
"DatetimeScalar | tuple[DatetimeScalar, DatetimeScalar] | xr.DataArray | xr.Dataset | TimeInterval"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's the correct fix here, since class TimeInterval is only defined below, I'll take a look

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently there is one difference between typing.Union and the more modern | operator:

typing.Union[int, float, "SomeString"] works, but int | float | "SomeString" doesn't.

Pandas changed from typing.Union to | — that's why we got the error now, so the fix for us is to use typing.Union ourselves.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh - noticed just now that you quoted the whole thing in a string, that works too, but I feel like that's the less elegant version 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can do that 👍

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nevermind, ty doesn't seem to like the old-style union with a string in it, so I'll just go for the full string solution.

)
# once we require python >= 3.12 we can replace this with a type statement, which doesn't require a string at all
# type TimeIntervalLike = DatetimeScalar | tuple[DatetimeScalar ... | TimeInterval


@dataclass(frozen=True)
Expand Down
1 change: 0 additions & 1 deletion tilebox-grpc/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ dependencies = [
"nest-asyncio>=1.5.0",
]


[dependency-groups]
dev = ["pytest-asyncio>=0.24.0", "pytest-cov>=5.0.0", "pytest>=8.3.2"]

Expand Down
1 change: 1 addition & 0 deletions tilebox-storage/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"folium>=0.15",
"shapely>=2",
"obstore>=0.8.0",
"boto3>=1.37.0", # required for the obstore Boto3CredentialProvider
]

[dependency-groups]
Expand Down
8 changes: 7 additions & 1 deletion tilebox-workflows/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@ dependencies = [
]

[dependency-groups]
dev = ["hypothesis>=6.112.1", "pytest-cov>=5.0.0", "pytest>=8.3.2", "moto>=5"]
dev = [
"hypothesis>=6.112.1",
"pytest-cov>=5.0.0",
"pytest>=8.3.2",
"moto>=5",
"pytest-asyncio>=1.3.0",
]

[project.urls]
Homepage = "https://tilebox.com"
Expand Down
Loading
Loading