Skip to content

Commit 5ab20c3

Browse files
committed
Merge branch 'flow-migration-stacked' of https://github.com/Omswastik-11/openml-python into flow-migration-stacked
2 parents 9c355cb + f62f606 commit 5ab20c3

11 files changed

Lines changed: 159 additions & 20 deletions

File tree

CONTRIBUTING.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,32 @@ $env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
107107
export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
108108
```
109109
110+
#### Diagnosing Slow Tests
111+
112+
If you suspect a test (or the suite as a whole) is running too slowly, `pytest` already exposes everything you need to investigate it. A few invocations that are useful when looking into test runtimes:
113+
114+
```bash
115+
# Show the 20 slowest tests (use 0 to list every test's duration)
116+
pytest tests --durations=20
117+
118+
# Fail any test that exceeds the given timeout (requires pytest-timeout)
119+
pytest tests --timeout=600
120+
121+
# Investigate only fixture/setup costs without actually running the tests
122+
pytest tests --setup-only
123+
124+
# Profile a specific module, class, or test
125+
pytest tests/test_datasets/test_dataset.py --durations=0
126+
127+
# Skip the slow live-server tests while profiling locally
128+
pytest tests --durations=0 -m "not production_server and not test_server"
129+
130+
# Run the suite in parallel to reproduce CI behaviour (requires pytest-xdist)
131+
pytest tests -n 4 --dist=load --durations=0
132+
```
133+
134+
Combining these with the marker filters (`production_server`, `test_server`, `sklearn`) makes it straightforward to narrow the investigation down to the slow tests without changing project configuration.
135+
110136
### Pull Request Checklist
111137
112138
You can go to the `openml-python` GitHub repository to create the pull request by [comparing the branch](https://github.com/openml/openml-python/compare) from your fork with the `main` branch of the `openml-python` repository. When creating a pull request, make sure to follow the comments and structure provided by the template on GitHub.
@@ -214,4 +240,4 @@ When dependencies are installed, run
214240
```bash
215241
mkdocs serve
216242
```
217-
This will open a preview of the website.
243+
This will open a preview of the website.

examples/Advanced/tasks_tutorial.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@
2424
#
2525
# We will start by simply listing only *supervised classification* tasks.
2626
#
27-
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
28-
# request a
27+
# **openml.list_tasks()** (or **openml.tasks.list_tasks()**) returns a dictionary of dictionaries by default, but we request a
2928
# [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
3029
# instead to have better visualization capabilities and easier access:
3130

3231
# %%
33-
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
32+
# New: top-level convenience alias
33+
tasks = openml.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
34+
# Old path still works:
35+
# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
3436
print(tasks.columns)
3537
print(f"First 5 of {len(tasks)} tasks:")
3638
print(tasks.head())
@@ -66,23 +68,29 @@
6668
# Similar to listing tasks by task type, we can list tasks by tags:
6769

6870
# %%
69-
tasks = openml.tasks.list_tasks(tag="OpenML100")
71+
tasks = openml.list_tasks(tag="OpenML100")
72+
# Old path still works:
73+
# tasks = openml.tasks.list_tasks(tag="OpenML100")
7074
print(f"First 5 of {len(tasks)} tasks:")
7175
print(tasks.head())
7276

7377
# %% [markdown]
7478
# Furthermore, we can list tasks based on the dataset id:
7579

7680
# %%
77-
tasks = openml.tasks.list_tasks(data_id=1471)
81+
tasks = openml.list_tasks(data_id=1471)
82+
# Old path still works:
83+
# tasks = openml.tasks.list_tasks(data_id=1471)
7884
print(f"First 5 of {len(tasks)} tasks:")
7985
print(tasks.head())
8086

8187
# %% [markdown]
8288
# In addition, a size limit and an offset can be applied both separately and simultaneously:
8389

8490
# %%
85-
tasks = openml.tasks.list_tasks(size=10, offset=50)
91+
tasks = openml.list_tasks(size=10, offset=50)
92+
# Old path still works:
93+
# tasks = openml.tasks.list_tasks(size=10, offset=50)
8694
print(tasks)
8795

8896
# %% [markdown]
@@ -98,7 +106,9 @@
98106
# Finally, it is also possible to list all tasks on OpenML with:
99107

100108
# %%
101-
tasks = openml.tasks.list_tasks()
109+
tasks = openml.list_tasks()
110+
# Old path still works:
111+
# tasks = openml.tasks.list_tasks()
102112
print(len(tasks))
103113

104114
# %% [markdown]
@@ -118,7 +128,10 @@
118128

119129
# %%
120130
task_id = 31
121-
task = openml.tasks.get_task(task_id)
131+
# New: top-level convenience alias
132+
task = openml.get_task(task_id)
133+
# Old path still works:
134+
# task = openml.tasks.get_task(task_id)
122135

123136
# %%
124137
# Properties of the task are stored as member variables:

examples/Basics/simple_datasets_tutorial.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,21 @@
1414
# ## List datasets stored on OpenML
1515

1616
# %%
17-
datasets_df = openml.datasets.list_datasets()
17+
# New: top-level convenience alias
18+
datasets_df = openml.list_datasets()
19+
# Old path still works for backwards compatibility:
20+
# datasets_df = openml.datasets.list_datasets()
1821
print(datasets_df.head(n=10))
1922

2023
# %% [markdown]
2124
# ## Download a dataset
2225

2326
# %%
2427
# Iris dataset https://www.openml.org/d/61
25-
dataset = openml.datasets.get_dataset(dataset_id=61)
28+
# New: top-level convenience alias
29+
dataset = openml.get_dataset(dataset_id=61)
30+
# Old path still works:
31+
# dataset = openml.datasets.get_dataset(dataset_id=61)
2632

2733
# Print a summary
2834
print(

examples/Basics/simple_flows_and_runs_tutorial.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@
2929
# NOTE: We are using task 119 from the test server: https://test.openml.org/d/20
3030

3131
# %%
32-
task = openml.tasks.get_task(119)
32+
# New: top-level convenience alias
33+
task = openml.get_task(119)
34+
# Old path still works:
35+
# task = openml.tasks.get_task(119)
3336

3437
# Get the data
3538
dataset = task.get_dataset()
@@ -54,7 +57,7 @@
5457

5558
# %% [markdown]
5659
# ## Upload the machine learning experiments to OpenML
57-
# First, create a fow and fill it with metadata about the machine learning model.
60+
# First, create a flow and fill it with metadata about the machine learning model.
5861

5962
# %%
6063
knn_flow = openml.flows.OpenMLFlow(

examples/Basics/simple_tasks_tutorial.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
# [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):
1111

1212
# %%
13-
task = openml.tasks.get_task(31)
13+
# New: top-level convenience alias
14+
task = openml.get_task(31)
15+
# Old path still works:
16+
# task = openml.tasks.get_task(31)
1417

1518
# %% [markdown]
1619
# Get the dataset and its data from the task.

openml/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,12 @@
3737
from .__version__ import __version__
3838
from ._api import _backend
3939
from .datasets import OpenMLDataFeature, OpenMLDataset
40+
from .datasets.functions import get_dataset, list_datasets
4041
from .evaluations import OpenMLEvaluation
4142
from .flows import OpenMLFlow
43+
from .flows.functions import get_flow, list_flows
4244
from .runs import OpenMLRun
45+
from .runs.functions import get_run, list_runs
4346
from .setups import OpenMLParameter, OpenMLSetup
4447
from .study import OpenMLBenchmarkSuite, OpenMLStudy
4548
from .tasks import (
@@ -51,6 +54,7 @@
5154
OpenMLSupervisedTask,
5255
OpenMLTask,
5356
)
57+
from .tasks.functions import get_task, list_tasks
5458

5559
if TYPE_CHECKING:
5660
from ._config import OpenMLConfigManager
@@ -124,6 +128,14 @@ def populate_cache(
124128
"exceptions",
125129
"extensions",
126130
"flows",
131+
"get_dataset",
132+
"get_flow",
133+
"get_run",
134+
"get_task",
135+
"list_datasets",
136+
"list_flows",
137+
"list_runs",
138+
"list_tasks",
127139
"runs",
128140
"setups",
129141
"study",

openml/datasets/functions.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,11 @@ def get_datasets(
364364
-------
365365
datasets : list of datasets
366366
A list of dataset objects.
367+
368+
Examples
369+
--------
370+
>>> import openml
371+
>>> datasets = openml.datasets.get_datasets([1, 2, 3]) # doctest: +SKIP
367372
"""
368373
datasets = []
369374
for dataset_id in dataset_ids:
@@ -446,6 +451,13 @@ def get_dataset( # noqa: C901, PLR0912
446451
-------
447452
dataset : :class:`openml.OpenMLDataset`
448453
The downloaded dataset.
454+
455+
Examples
456+
--------
457+
>>> import openml
458+
>>> dataset = openml.datasets.get_dataset(1) # doctest: +SKIP
459+
>>> dataset = openml.datasets.get_dataset("iris", version=1) # doctest: +SKIP
460+
>>> dataset = openml.datasets.get_dataset(1, download_data=True) # doctest: +SKIP
449461
"""
450462
if download_all_files:
451463
warnings.warn(

openml/runs/functions.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,15 @@ def run_model_on_task( # noqa: PLR0913
103103
Result of the run.
104104
flow : OpenMLFlow (optional, only if `return_flow` is True).
105105
Flow generated from the model.
106+
107+
Examples
108+
--------
109+
>>> import openml
110+
>>> import openml_sklearn # doctest: +SKIP
111+
>>> from sklearn.tree import DecisionTreeClassifier # doctest: +SKIP
112+
>>> clf = DecisionTreeClassifier() # doctest: +SKIP
113+
>>> task = openml.tasks.get_task(6) # doctest: +SKIP
114+
>>> run = openml.runs.run_model_on_task(clf, task) # doctest: +SKIP
106115
"""
107116
if avoid_duplicate_runs is None:
108117
avoid_duplicate_runs = openml.config.avoid_duplicate_runs
@@ -558,9 +567,14 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901
558567
) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs`
559568

560569
for n_fit, rep_no, fold_no, sample_no in jobs:
561-
pred_y, proba_y, test_indices, test_y, inner_trace, user_defined_measures_fold = job_rvals[
562-
n_fit - 1
563-
]
570+
(
571+
pred_y,
572+
proba_y,
573+
test_indices,
574+
test_y,
575+
inner_trace,
576+
user_defined_measures_fold,
577+
) = job_rvals[n_fit - 1]
564578

565579
if inner_trace is not None:
566580
traces.append(inner_trace)
@@ -845,7 +859,10 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT0
845859
return _create_run_from_xml(run_xml)
846860

847861

848-
def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, FBT002
862+
def _create_run_from_xml( # noqa: PLR0915, PLR0912, C901
863+
xml: str,
864+
from_server: bool = True, # noqa: FBT002
865+
) -> OpenMLRun:
849866
"""Create a run object from xml returned from server.
850867
851868
Parameters

openml/study/functions.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ def get_suite(suite_id: int | str) -> OpenMLBenchmarkSuite:
2929
-------
3030
OpenMLSuite
3131
The OpenML suite object
32+
33+
Examples
34+
--------
35+
>>> import openml
36+
>>> suite = openml.study.get_suite(99) # doctest: +SKIP
37+
>>> suite = openml.study.get_suite("OpenML-CC18") # doctest: +SKIP
3238
"""
3339
study = _get_study(suite_id, entity_type="task")
3440
assert isinstance(study, OpenMLBenchmarkSuite)
@@ -58,6 +64,11 @@ def get_study(
5864
-------
5965
OpenMLStudy
6066
The OpenML study object
67+
68+
Examples
69+
--------
70+
>>> import openml
71+
>>> study = openml.study.get_study(1) # doctest: +SKIP
6172
"""
6273
if study_id == "OpenML100":
6374
message = (
@@ -108,7 +119,10 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
108119
tags = []
109120
if "oml:tag" in result_dict:
110121
for tag in result_dict["oml:tag"]:
111-
current_tag = {"name": tag["oml:name"], "write_access": tag["oml:write_access"]}
122+
current_tag = {
123+
"name": tag["oml:name"],
124+
"write_access": tag["oml:write_access"],
125+
}
112126
if "oml:window_start" in tag:
113127
current_tag["window_start"] = tag["oml:window_start"]
114128
tags.append(current_tag)
@@ -209,6 +223,15 @@ def create_study(
209223
-------
210224
OpenMLStudy
211225
A local OpenML study object (call publish method to upload to server)
226+
227+
Examples
228+
--------
229+
>>> import openml
230+
>>> study = openml.study.create_study( # doctest: +SKIP
231+
... name="My Study",
232+
... description="A study on decision trees",
233+
... run_ids=[1, 2, 3],
234+
... )
212235
"""
213236
return OpenMLStudy(
214237
study_id=None,

openml/tasks/functions.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,11 @@ def get_tasks(
338338
tasks = []
339339
for task_id in task_ids:
340340
tasks.append(
341-
get_task(task_id, download_data=download_data, download_qualities=download_qualities)
341+
get_task(
342+
task_id,
343+
download_data=download_data,
344+
download_qualities=download_qualities,
345+
)
342346
)
343347
return tasks
344348

@@ -369,6 +373,12 @@ def get_task(
369373
Returns
370374
-------
371375
task: OpenMLTask
376+
377+
Examples
378+
--------
379+
>>> import openml
380+
>>> task = openml.tasks.get_task(1) # doctest: +SKIP
381+
>>> task = openml.tasks.get_task(1, download_splits=True) # doctest: +SKIP
372382
"""
373383
if not isinstance(task_id, int):
374384
raise TypeError(f"Task id should be integer, is {type(task_id)}")

0 commit comments

Comments
 (0)