Skip to content

Commit 1b67ace

Browse files
committed
Merge branch 'main' into move_utils
2 parents 0069e78 + 0769ff5 commit 1b67ace

5 files changed

Lines changed: 92 additions & 13 deletions

File tree

.github/workflows/test.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ concurrency:
2222

2323
jobs:
2424
test:
25-
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }})
25+
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
2626
runs-on: ${{ matrix.os }}
2727

2828
strategy:
@@ -64,6 +64,14 @@ jobs:
6464
sklearn-only: "false"
6565
code-cov: true
6666

67+
# Pandas 2 run
68+
- os: ubuntu-latest
69+
python-version: "3.12"
70+
scikit-learn: "1.5.*"
71+
sklearn-only: "false"
72+
pandas-version: "2.*"
73+
code-cov: false
74+
6775
steps:
6876
- uses: actions/checkout@v6
6977
with:
@@ -74,10 +82,16 @@ jobs:
7482
with:
7583
python-version: ${{ matrix.python-version }}
7684

77-
- name: Install test dependencies and scikit-learn
85+
- name: Install test dependencies, scikit-learn, and optional pandas
86+
shell: bash
7887
run: |
7988
python -m pip install --upgrade pip
8089
pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
90+
91+
if [ "${{ matrix.pandas-version }}" != "" ]; then
92+
echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
93+
pip install "pandas==${{ matrix.pandas-version }}"
94+
fi
8195
8296
- name: Store repository status
8397
id: status-before

openml/datasets/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915
488488
try:
489489
# checks if the strings which should be the class labels
490490
# can be encoded into integers
491-
pd.factorize(type_)[0]
491+
pd.factorize(np.array(type_))[0]
492492
except ValueError as e:
493493
raise ValueError(
494494
"Categorical data needs to be numeric when using sparse ARFF."

openml/utils/_openml.py

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,20 @@
22
from __future__ import annotations
33

44
import contextlib
5+
import re
56
import shutil
67
import warnings
7-
from collections.abc import Callable, Mapping, Sized
8+
from abc import ABC, abstractmethod
9+
from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
810
from functools import wraps
911
from pathlib import Path
10-
from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload
12+
from typing import (
13+
TYPE_CHECKING,
14+
Any,
15+
Literal,
16+
TypeVar,
17+
overload,
18+
)
1119
from typing_extensions import ParamSpec
1220

1321
import numpy as np
@@ -469,3 +477,57 @@ def update(self, length: int) -> None:
469477
self._progress_bar.update(length)
470478
if self._progress_bar.total <= self._progress_bar.n:
471479
self._progress_bar.close()
480+
481+
482+
class ReprMixin(ABC):
483+
"""A mixin class that provides a customizable string representation for OpenML objects.
484+
485+
This mixin standardizes the __repr__ output format across OpenML classes.
486+
Classes inheriting from this mixin should implement the
487+
_get_repr_body_fields method to specify which fields to display.
488+
"""
489+
490+
def __repr__(self) -> str:
491+
body_fields = self._get_repr_body_fields()
492+
return self._apply_repr_template(body_fields)
493+
494+
@abstractmethod
495+
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
496+
"""Collect all information to display in the __repr__ body.
497+
498+
Returns
499+
-------
500+
body_fields : List[Tuple[str, Union[str, int, List[str]]]]
501+
A list of (name, value) pairs to display in the body of the __repr__.
502+
E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
503+
If value is a List of str, then each item of the list will appear in a separate row.
504+
"""
505+
# Should be implemented in the base class.
506+
507+
def _apply_repr_template(
508+
self,
509+
body_fields: Iterable[tuple[str, str | int | list[str] | None]],
510+
) -> str:
511+
"""Generates the header and formats the body for string representation of the object.
512+
513+
Parameters
514+
----------
515+
body_fields: List[Tuple[str, str]]
516+
A list of (name, value) pairs to display in the body of the __repr__.
517+
"""
518+
# We add spaces between capitals, e.g. ClassificationTask -> Classification Task
519+
name_with_spaces = re.sub(
520+
r"(\w)([A-Z])",
521+
r"\1 \2",
522+
self.__class__.__name__[len("OpenML") :],
523+
)
524+
header_text = f"OpenML {name_with_spaces}"
525+
header = f"{header_text}\n{'=' * len(header_text)}\n"
526+
527+
_body_fields: list[tuple[str, str | int | list[str]]] = [
528+
(k, "None" if v is None else v) for k, v in body_fields
529+
]
530+
longest_field_name_length = max(len(name) for name, _ in _body_fields)
531+
field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
532+
body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
533+
return header + body

tests/test_datasets/test_dataset.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
102102
assert isinstance(data, pd.DataFrame)
103103
assert data.shape[1] == len(self.titanic.features)
104104
assert data.shape[0] == 1309
105+
# Dynamically detect what this version of Pandas calls string columns.
106+
str_dtype = data["name"].dtype.name
107+
105108
col_dtype = {
106109
"pclass": "uint8",
107110
"survived": "category",
108-
"name": "object",
111+
"name": str_dtype,
109112
"sex": "category",
110113
"age": "float64",
111114
"sibsp": "uint8",
112115
"parch": "uint8",
113-
"ticket": "object",
116+
"ticket": str_dtype,
114117
"fare": "float64",
115-
"cabin": "object",
118+
"cabin": str_dtype,
116119
"embarked": "category",
117-
"boat": "object",
120+
"boat": str_dtype,
118121
"body": "float64",
119-
"home.dest": "object",
122+
"home.dest": str_dtype,
120123
}
121124
for col_name in data.columns:
122125
assert data[col_name].dtype.name == col_dtype[col_name]
@@ -357,7 +360,7 @@ def setUp(self):
357360
def test_get_sparse_dataset_dataframe_with_target(self):
358361
X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
359362
assert isinstance(X, pd.DataFrame)
360-
assert isinstance(X.dtypes[0], pd.SparseDtype)
363+
assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
361364
assert X.shape == (600, 20000)
362365

363366
assert isinstance(y, pd.Series)

tests/test_flows/test_flow_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ def _check_flow(self, flow):
4141
assert isinstance(flow["full_name"], str)
4242
assert isinstance(flow["version"], str)
4343
# There are some runs on openml.org that can have an empty external version
44+
ext_version = flow["external_version"]
4445
ext_version_str_or_none = (
45-
isinstance(flow["external_version"], str) or flow["external_version"] is None
46+
isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
4647
)
4748
assert ext_version_str_or_none
4849

@@ -338,7 +339,6 @@ def test_get_flow_reinstantiate_model_no_extension(self):
338339
reason="Requires scikit-learn!=0.19.1, because target flow is from that version.",
339340
)
340341
@pytest.mark.production()
341-
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
342342
def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
343343
self.use_production_server()
344344
flow = 8175

0 commit comments

Comments
 (0)