Skip to content

Commit f120c19

Browse files
authored
Merge branch 'main' into studies-migration
2 parents 96cee47 + 8cc6429 commit f120c19

14 files changed

Lines changed: 368 additions & 103 deletions

File tree

.github/workflows/test.yml

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,22 +101,40 @@ jobs:
101101
echo "BEFORE=$git_status" >> $GITHUB_ENV
102102
echo "Repository status before tests: $git_status"
103103
104+
- name: Clone Services
105+
if: matrix.os == 'ubuntu-latest'
106+
run: |
107+
git clone --depth 1 https://github.com/openml/services.git
108+
109+
- name: Start Docker Services
110+
if: matrix.os == 'ubuntu-latest'
111+
working-directory: ./services
112+
run: |
113+
docker compose --profile rest-api --profile minio up -d
114+
115+
echo "Waiting for PHP API to boot..."
116+
timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
117+
118+
echo "Final Verification: Gateway Connectivity..."
119+
curl -sSfL http://localhost:8000/api/v1/xml/data/1 | head -n 15
120+
104121
- name: Show installed dependencies
105122
run: python -m pip list
106123

107124
- name: Run tests on Ubuntu Test
108125
if: matrix.os == 'ubuntu-latest'
109126
env:
110127
OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
128+
OPENML_USE_LOCAL_SERVICES: "true"
111129
run: |
112130
if [ "${{ matrix.code-cov }}" = "true" ]; then
113131
codecov="--cov=openml --long --cov-report=xml"
114132
fi
115133
116134
if [ "${{ matrix.sklearn-only }}" = "true" ]; then
117-
marks="sklearn and not production_server and not test_server"
135+
marks="sklearn and not production_server"
118136
else
119-
marks="not production_server and not test_server"
137+
marks="not production_server"
120138
fi
121139
122140
pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -125,15 +143,16 @@ jobs:
125143
if: matrix.os == 'ubuntu-latest'
126144
env:
127145
OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
146+
OPENML_USE_LOCAL_SERVICES: "true"
128147
run: |
129148
if [ "${{ matrix.code-cov }}" = "true" ]; then
130149
codecov="--cov=openml --long --cov-report=xml"
131150
fi
132151
133152
if [ "${{ matrix.sklearn-only }}" = "true" ]; then
134-
marks="sklearn and production_server and not test_server"
153+
marks="sklearn and production_server"
135154
else
136-
marks="production_server and not test_server"
155+
marks="production_server"
137156
fi
138157
139158
pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -145,6 +164,20 @@ jobs:
145164
run: | # we need a separate step because of the bash-specific if-statement in the previous one.
146165
pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
147166
167+
- name: Upload coverage
168+
if: matrix.code-cov && always()
169+
uses: codecov/codecov-action@v4
170+
with:
171+
files: coverage.xml
172+
token: ${{ secrets.CODECOV_TOKEN }}
173+
fail_ci_if_error: true
174+
verbose: true
175+
176+
- name: Cleanup Docker setup
177+
if: matrix.os == 'ubuntu-latest' && always()
178+
run: |
179+
sudo rm -rf services
180+
148181
- name: Check for files left behind by test
149182
if: matrix.os != 'windows-latest' && always()
150183
run: |
@@ -157,15 +190,6 @@ jobs:
157190
exit 1
158191
fi
159192
160-
- name: Upload coverage
161-
if: matrix.code-cov && always()
162-
uses: codecov/codecov-action@v4
163-
with:
164-
files: coverage.xml
165-
token: ${{ secrets.CODECOV_TOKEN }}
166-
fail_ci_if_error: true
167-
verbose: true
168-
169193
dummy_windows_py_sk024:
170194
name: (windows-latest, Py, sk0.24.*, sk-only:false)
171195
runs-on: ubuntu-latest

openml/runs/functions.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,8 @@ def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> An
375375
run = get_run(run_id)
376376
# TODO(eddiebergman): I imagine this is None if it's not published,
377377
# might need to raise an explicit error for that
378-
assert run.setup_id is not None
378+
if run.setup_id is None:
379+
raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.")
379380
return initialize_model(setup_id=run.setup_id, strict_version=strict_version)
380381

381382

@@ -415,7 +416,8 @@ def initialize_model_from_trace(
415416
run = get_run(run_id)
416417
# TODO(eddiebergman): I imagine this is None if it's not published,
417418
# might need to raise an explicit error for that
418-
assert run.flow_id is not None
419+
if run.flow_id is None:
420+
raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.")
419421

420422
flow = get_flow(run.flow_id)
421423
run_trace = get_run_trace(run_id)
@@ -575,8 +577,10 @@ def _calculate_local_measure( # type: ignore
575577
_user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y)
576578

577579
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
578-
assert test_y is not None
579-
assert proba_y is not None
580+
if test_y is None:
581+
raise ValueError("test_y cannot be None for classification tasks.")
582+
if proba_y is None:
583+
raise ValueError("proba_y cannot be None for classification tasks.")
580584

581585
for i, tst_idx in enumerate(test_indices):
582586
if task.class_labels is not None:
@@ -621,7 +625,8 @@ def _calculate_local_measure( # type: ignore
621625
)
622626

623627
elif isinstance(task, OpenMLRegressionTask):
624-
assert test_y is not None
628+
if test_y is None:
629+
raise ValueError("test_y cannot be None for regression tasks.")
625630
for i, _ in enumerate(test_indices):
626631
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
627632
arff_line = format_prediction(
@@ -742,7 +747,8 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913
742747

743748
if isinstance(task, OpenMLSupervisedTask):
744749
x, y = task.get_X_and_y()
745-
assert isinstance(y, (pd.Series, pd.DataFrame))
750+
if not isinstance(y, (pd.Series, pd.DataFrame)):
751+
raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}")
746752
train_x = x.iloc[train_indices]
747753
train_y = y.iloc[train_indices]
748754
test_x = x.iloc[test_indices]
@@ -1212,7 +1218,11 @@ def __list_runs(api_call: str) -> pd.DataFrame:
12121218
f'"http://openml.org/openml": {runs_dict}',
12131219
)
12141220

1215-
assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"])
1221+
if not isinstance(runs_dict["oml:runs"]["oml:run"], list):
1222+
raise TypeError(
1223+
f"Expected runs_dict['oml:runs']['oml:run'] to be a list, "
1224+
f"got {type(runs_dict['oml:runs']['oml:run']).__name__}"
1225+
)
12161226

12171227
runs = {
12181228
int(r["oml:run_id"]): {

openml/runs/run.py

Lines changed: 58 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,57 @@ def to_filesystem(
389389
if self.trace is not None:
390390
self.trace._to_filesystem(directory)
391391

392+
def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]:
    """Get ARFF attributes based on task type.

    Parameters
    ----------
    task : OpenMLTask
        The task for which to generate attributes.

    Returns
    -------
    list[tuple[str, Any]]
        List of attribute tuples (name, type). Every task type starts with
        the ``repeat`` and ``fold`` columns; classification and learning
        curve tasks additionally carry a ``sample`` column before
        ``row_id``, followed by the prediction columns for the task type.

    Raises
    ------
    ValueError
        If a classification or learning curve task has no class labels.
    NotImplementedError
        If the task is not a classification, learning curve, regression
        or clustering task.
    """
    # A single isinstance check keeps static type narrowing of ``task`` for
    # the ``class_labels`` access and avoids evaluating the same test twice.
    if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
        class_labels = task.class_labels
        if class_labels is None:
            raise ValueError("The task has no class labels")

        return [
            ("repeat", "NUMERIC"),
            ("fold", "NUMERIC"),
            ("sample", "NUMERIC"),
            ("row_id", "NUMERIC"),
            ("prediction", class_labels),
            ("correct", class_labels),
            # One confidence column per class label, in label order.
            *(("confidence." + label, "NUMERIC") for label in class_labels),
        ]

    instance_specifications: list[tuple[str, Any]] = [
        ("repeat", "NUMERIC"),
        ("fold", "NUMERIC"),
        ("row_id", "NUMERIC"),
    ]

    if isinstance(task, OpenMLRegressionTask):
        return [*instance_specifications, ("prediction", "NUMERIC"), ("truth", "NUMERIC")]

    if isinstance(task, OpenMLClusteringTask):
        return [*instance_specifications, ("cluster", "NUMERIC")]

    supported_task_types = [
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.CLUSTERING,
        TaskType.LEARNING_CURVE,
    ]
    raise NotImplementedError(
        f"Task type {task.task_type!s} for task_id {getattr(task, 'task_id', None)!s} "
        f"is not yet supported. Supported task types are: {supported_task_types!r}"
    )
442+
392443
def _generate_arff_dict(self) -> OrderedDict[str, Any]:
393444
"""Generates the arff dictionary for uploading predictions to the
394445
server.
@@ -406,7 +457,8 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
406457
if self.data_content is None:
407458
raise ValueError("Run has not been executed.")
408459
if self.flow is None:
409-
assert self.flow_id is not None, "Run has no associated flow id!"
460+
if self.flow_id is None:
461+
raise ValueError("Run has no associated flow id!")
410462
self.flow = get_flow(self.flow_id)
411463

412464
if self.description_text is None:
@@ -417,74 +469,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
417469
arff_dict["data"] = self.data_content
418470
arff_dict["description"] = self.description_text
419471
arff_dict["relation"] = f"openml_task_{task.task_id}_predictions"
420-
421-
if isinstance(task, OpenMLLearningCurveTask):
422-
class_labels = task.class_labels
423-
instance_specifications = [
424-
("repeat", "NUMERIC"),
425-
("fold", "NUMERIC"),
426-
("sample", "NUMERIC"),
427-
("row_id", "NUMERIC"),
428-
]
429-
430-
arff_dict["attributes"] = instance_specifications
431-
if class_labels is not None:
432-
arff_dict["attributes"] = (
433-
arff_dict["attributes"]
434-
+ [("prediction", class_labels), ("correct", class_labels)]
435-
+ [
436-
("confidence." + class_labels[i], "NUMERIC")
437-
for i in range(len(class_labels))
438-
]
439-
)
440-
else:
441-
raise ValueError("The task has no class labels")
442-
443-
elif isinstance(task, OpenMLClassificationTask):
444-
class_labels = task.class_labels
445-
instance_specifications = [
446-
("repeat", "NUMERIC"),
447-
("fold", "NUMERIC"),
448-
("sample", "NUMERIC"), # Legacy
449-
("row_id", "NUMERIC"),
450-
]
451-
452-
arff_dict["attributes"] = instance_specifications
453-
if class_labels is not None:
454-
prediction_confidences = [
455-
("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
456-
]
457-
prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
458-
arff_dict["attributes"] = (
459-
arff_dict["attributes"] + prediction_and_true + prediction_confidences
460-
)
461-
else:
462-
raise ValueError("The task has no class labels")
463-
464-
elif isinstance(task, OpenMLRegressionTask):
465-
arff_dict["attributes"] = [
466-
("repeat", "NUMERIC"),
467-
("fold", "NUMERIC"),
468-
("row_id", "NUMERIC"),
469-
("prediction", "NUMERIC"),
470-
("truth", "NUMERIC"),
471-
]
472-
473-
elif isinstance(task, OpenMLClusteringTask):
474-
arff_dict["attributes"] = [
475-
("repeat", "NUMERIC"),
476-
("fold", "NUMERIC"),
477-
("row_id", "NUMERIC"),
478-
("cluster", "NUMERIC"),
479-
]
480-
481-
else:
482-
raise NotImplementedError(
483-
f"Task type '{task.task_type}' is not yet supported. "
484-
f"Supported task types: Classification, Regression, Clustering, Learning Curve. "
485-
f"Task ID: {task.task_id}. "
486-
f"Please check the OpenML documentation for supported task types."
487-
)
472+
arff_dict["attributes"] = self._get_arff_attributes_for_task(task)
488473

489474
return arff_dict
490475

@@ -641,7 +626,10 @@ def _get_file_elements(self) -> dict:
641626

642627
if self.parameter_settings is None:
643628
if self.flow is None:
644-
assert self.flow_id is not None # for mypy
629+
if self.flow_id is None:
630+
raise ValueError(
631+
"Run has no associated flow_id and cannot obtain parameter values."
632+
)
645633
self.flow = openml.flows.get_flow(self.flow_id)
646634
self.parameter_settings = self.flow.extension.obtain_parameter_values(
647635
self.flow,

openml/runs/trace.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ def get_parameters(self) -> dict[str, Any]:
9494
for param, value in self.setup_string.items()
9595
}
9696

97-
assert self.parameters is not None
97+
if self.parameters is None:
98+
raise ValueError("Parameters must be set before calling get_parameters().")
9899
return {param[len(PREFIX) :]: value for param, value in self.parameters.items()}
99100

100101

@@ -490,13 +491,21 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace:
490491
for iteration in trace:
491492
key = (iteration.repeat, iteration.fold, iteration.iteration)
492493

493-
assert iteration.parameters is not None
494+
if iteration.parameters is None:
495+
raise ValueError(
496+
f"Iteration parameters cannot be None for repeat {iteration.repeat}, "
497+
f"fold {iteration.fold}, iteration {iteration.iteration}"
498+
)
494499
param_keys = iteration.parameters.keys()
495500

496501
if previous_iteration is not None:
497502
trace_itr = merged_trace[previous_iteration]
498503

499-
assert trace_itr.parameters is not None
504+
if trace_itr.parameters is None:
505+
raise ValueError(
506+
f"Trace iteration parameters cannot be None "
507+
f"for iteration {previous_iteration}"
508+
)
500509
trace_itr_keys = trace_itr.parameters.keys()
501510

502511
if list(param_keys) != list(trace_itr_keys):

tests/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ def as_robot() -> Iterator[None]:
289289
@pytest.fixture(autouse=True)
290290
def with_server(request):
291291
openml.config.set_api_version(APIVersion.V1)
292+
if os.getenv("OPENML_USE_LOCAL_SERVICES") == "true":
293+
openml.config.TEST_SERVER_URL = "http://localhost:8000"
292294
if "production_server" in request.keywords:
293295
openml.config.set_servers("production")
294296
yield

0 commit comments

Comments
 (0)