Skip to content

Commit a08dff2

Browse files
authored
Merge branch 'develop' into maint/pytest_test_dataset_test__read_qualities
2 parents c5598e0 + 5be0d24 commit a08dff2

12 files changed

Lines changed: 216 additions & 229 deletions

.all-contributorsrc

Lines changed: 0 additions & 36 deletions
This file was deleted.

CONTRIBUTING.md

Lines changed: 96 additions & 140 deletions
Large diffs are not rendered by default.

ISSUE_TEMPLATE.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
<!--
2+
It is recommended to check that your issue complies with the
3+
following rules before submitting:
4+
5+
- Verify that your issue is not being currently addressed by other
6+
issues (https://github.com/openml/openml-python/issues)
7+
or pull requests (https://github.com/openml/openml-python/pulls).
8+
9+
- Please ensure all code snippets and error messages are formatted in
10+
appropriate code blocks. See https://help.github.com/articles/creating-and-highlighting-code-blocks
11+
-->
12+
113
#### Description
214
<!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->
315

@@ -20,7 +32,10 @@ it in the issue: https://gist.github.com
2032

2133
#### Versions
2234
<!--
23-
Please run the following snippet and paste the output below.
35+
Please include your operating system type and version number, as well
36+
as your Python, openml, scikit-learn, numpy, and scipy versions. This information
37+
can be found by running the following code snippet:
38+
2439
import platform; print(platform.platform())
2540
import sys; print("Python", sys.version)
2641
import numpy; print("NumPy", numpy.__version__)
@@ -30,4 +45,5 @@ import openml; print("OpenML", openml.__version__)
3045
-->
3146

3247

33-
<!-- Thanks for contributing! -->
48+
<!-- Thanks for contributing! -->
49+

PULL_REQUEST_TEMPLATE.md

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ the contribution guidelines: https://github.com/openml/openml-python/blob/main/C
44
55
Please make sure that:
66
7+
* the title of the pull request is descriptive
78
* this pull request is against the `develop` branch
8-
* you updated all docs, this includes the changelog (doc/progress.rst)
99
* for any new function or class added, please add it to doc/api.rst
1010
* the list of classes and functions should be alphabetical
1111
* for any new functionality, consider adding a relevant example
@@ -14,15 +14,20 @@ Please make sure that:
1414
* add the BSD 3-Clause license to any new file created
1515
-->
1616

17-
#### Reference Issue
18-
<!-- Example: Fixes #1234 -->
17+
#### Metadata
18+
* Reference Issue: <!-- Example: Fixes #1234 or NA-->
19+
* New Tests Added: <!-- Yes/No/NA -->
20+
* Documentation Updated: <!-- Yes/No/NA -->
21+
* Change Log Entry: <!-- Short String, example: "Add new function `foo()` to module `bar`"; or "Fixes a bug with `bar`" -->
1922

2023

21-
#### What does this PR implement/fix? Explain your changes.
22-
23-
24-
#### How should this PR be tested?
25-
24+
#### Details
25+
<!--
26+
If necessary, please share the following:
2627
27-
#### Any other comments?
28+
* What does this PR implement/fix? Explain your changes.
29+
* Why is this change necessary? What is the problem it solves?
30+
* How can I reproduce the issue this PR is solving and its solution?
31+
* Any other comments?
32+
-->
2833

doc/contributing.rst

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,7 @@ In particular, a few ways to contribute to openml-python are:
1616
* A contribution to an openml-python extension. An extension package allows OpenML to interface
1717
with a machine learning package (such as scikit-learn or keras). These extensions
1818
are hosted in separate repositories and may have their own guidelines.
19-
For more information, see the :ref:`extensions` below.
20-
21-
* Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let
22-
us know about the problem. See `this section <https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md>`_.
19+
For more information, see the :ref:`extensions`.
2320

2421
* `Cite OpenML <https://www.openml.org/cite>`_ if you use it in a scientific publication.
2522

openml/tasks/functions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
492492
"data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
493493
"evaluation_measure": evaluation_measures,
494494
}
495+
# TODO: add OpenMLClusteringTask?
495496
if task_type in (
496497
TaskType.SUPERVISED_CLASSIFICATION,
497498
TaskType.SUPERVISED_REGRESSION,
@@ -508,6 +509,10 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
508509
common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
509510
"oml:estimation_procedure"
510511
]["oml:type"]
512+
common_kwargs["estimation_procedure_id"] = int(
513+
inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
514+
)
515+
511516
common_kwargs["estimation_parameters"] = estimation_parameters
512517
common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
513518
common_kwargs["data_splits_url"] = inputs["estimation_procedure"][

pyproject.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ dependencies = [
2323
"packaging",
2424
]
2525
requires-python = ">=3.8"
26+
maintainers = [
27+
{ name = "Pieter Gijsbers", email="p.gijsbers@tue.nl"},
28+
{ name = "Lennart Purucker"},
29+
]
2630
authors = [
27-
{ name = "Matthias Feurer", email="feurerm@informatik.uni-freiburg.de" },
31+
{ name = "Matthias Feurer"},
2832
{ name = "Jan van Rijn" },
2933
{ name = "Arlind Kadra" },
3034
{ name = "Pieter Gijsbers" },
@@ -52,6 +56,7 @@ classifiers = [
5256
"Programming Language :: Python :: 3.10",
5357
"Programming Language :: Python :: 3.11",
5458
"Programming Language :: Python :: 3.12",
59+
"Programming Language :: Python :: 3.13",
5560
]
5661
license = { file = "LICENSE" }
5762

@@ -74,6 +79,7 @@ test=[
7479
"pytest-rerunfailures",
7580
"mypy",
7681
"ruff",
82+
"requests-mock",
7783
]
7884
examples=[
7985
"matplotlib",
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<oml:data_set_description xmlns:oml="http://openml.org/openml">
2+
<oml:id>61</oml:id>
3+
<oml:name>iris</oml:name>
4+
<oml:version>1</oml:version>
5+
<oml:description>**Author**: R.A. Fisher
6+
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall
7+
**Please cite**:
8+
9+
**Iris Plants Database**
10+
This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda &amp; Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
11+
12+
Predicted attribute: class of iris plant.
13+
This is an exceedingly simple domain.
14+
15+
### Attribute Information:
16+
1. sepal length in cm
17+
2. sepal width in cm
18+
3. petal length in cm
19+
4. petal width in cm
20+
5. class:
21+
-- Iris Setosa
22+
-- Iris Versicolour
23+
-- Iris Virginica</oml:description>
24+
<oml:description_version>4</oml:description_version>
25+
<oml:format>ARFF</oml:format>
26+
<oml:creator>R.A. Fisher</oml:creator> <oml:collection_date>1936</oml:collection_date> <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
27+
<oml:language>English</oml:language> <oml:licence>Public</oml:licence> <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
28+
<oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url> <oml:file_id>61</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation> <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag> <oml:visibility>public</oml:visibility> <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url> <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url> <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url> <oml:status>active</oml:status>
29+
<oml:processing_date>2020-11-20 19:02:18</oml:processing_date> <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
30+
</oml:data_set_description>

tests/test_datasets/test_dataset.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -402,20 +402,6 @@ def test_get_sparse_categorical_data_id_395(self):
402402
class OpenMLDatasetFunctionTest(TestBase):
403403

404404

405-
def test__check_qualities(self):
406-
qualities = [{"oml:name": "a", "oml:value": "0.5"}]
407-
qualities = openml.datasets.dataset._check_qualities(qualities)
408-
assert qualities["a"] == 0.5
409-
410-
qualities = [{"oml:name": "a", "oml:value": "null"}]
411-
qualities = openml.datasets.dataset._check_qualities(qualities)
412-
assert qualities["a"] != qualities["a"]
413-
414-
qualities = [{"oml:name": "a", "oml:value": None}]
415-
qualities = openml.datasets.dataset._check_qualities(qualities)
416-
assert qualities["a"] != qualities["a"]
417-
418-
419405

420406
def test__read_features(mocker, workdir, static_cache_dir):
421407
"""Test we read the features from the xml if no cache pickle is available.
@@ -478,3 +464,16 @@ def test__read_qualities(static_cache_dir, workdir, mocker):
478464
assert pickle_mock.dump.call_count == 1
479465

480466

467+
def test__check_qualities():
468+
qualities = [{"oml:name": "a", "oml:value": "0.5"}]
469+
qualities = openml.datasets.dataset._check_qualities(qualities)
470+
assert qualities["a"] == 0.5
471+
472+
qualities = [{"oml:name": "a", "oml:value": "null"}]
473+
qualities = openml.datasets.dataset._check_qualities(qualities)
474+
assert qualities["a"] != qualities["a"]
475+
476+
qualities = [{"oml:name": "a", "oml:value": None}]
477+
qualities = openml.datasets.dataset._check_qualities(qualities)
478+
assert qualities["a"] != qualities["a"]
479+

tests/test_datasets/test_dataset_functions.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pandas as pd
1818
import pytest
1919
import requests
20+
import requests_mock
2021
import scipy.sparse
2122
from oslo_concurrency import lockutils
2223

@@ -387,14 +388,6 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
387388
file_destination
388389
), "_download_minio_file can download from subdirectories"
389390

390-
def test__get_dataset_parquet_not_cached(self):
391-
description = {
392-
"oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
393-
"oml:id": "20",
394-
}
395-
path = _get_dataset_parquet(description, cache_directory=self.workdir)
396-
assert isinstance(path, Path), "_get_dataset_parquet returns a path"
397-
assert path.is_file(), "_get_dataset_parquet returns path to real file"
398391

399392
@mock.patch("openml._api_calls._download_minio_file")
400393
def test__get_dataset_parquet_is_cached(self, patch):
@@ -1504,16 +1497,6 @@ def test_data_fork(self):
15041497
data_id=999999,
15051498
)
15061499

1507-
@pytest.mark.production()
1508-
def test_get_dataset_parquet(self):
1509-
# Parquet functionality is disabled on the test server
1510-
# There is no parquet-copy of the test server yet.
1511-
openml.config.server = self.production_server
1512-
dataset = openml.datasets.get_dataset(61, download_data=True)
1513-
assert dataset._parquet_url is not None
1514-
assert dataset.parquet_file is not None
1515-
assert os.path.isfile(dataset.parquet_file)
1516-
assert dataset.data_file is None # is alias for arff path
15171500

15181501
@pytest.mark.production()
15191502
def test_list_datasets_with_high_size_parameter(self):
@@ -1942,6 +1925,16 @@ def test_get_dataset_with_invalid_id() -> None:
19421925
assert e.value.code == 111
19431926

19441927

1928+
def test__get_dataset_parquet_not_cached():
1929+
description = {
1930+
"oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
1931+
"oml:id": "20",
1932+
}
1933+
path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory()))
1934+
assert isinstance(path, Path), "_get_dataset_parquet returns a path"
1935+
assert path.is_file(), "_get_dataset_parquet returns path to real file"
1936+
1937+
19451938
def test_read_features_from_xml_with_whitespace() -> None:
19461939
from openml.datasets.dataset import _read_features
19471940

@@ -1950,3 +1943,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
19501943
)
19511944
dict = _read_features(features_file)
19521945
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
1946+
1947+
1948+
def test_get_dataset_parquet(requests_mock, test_files_directory):
1949+
# Parquet functionality is disabled on the test server
1950+
# There is no parquet-copy of the test server yet.
1951+
content_file = (
1952+
test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
1953+
)
1954+
requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
1955+
dataset = openml.datasets.get_dataset(61, download_data=True)
1956+
assert dataset._parquet_url is not None
1957+
assert dataset.parquet_file is not None
1958+
assert os.path.isfile(dataset.parquet_file)
1959+
assert dataset.data_file is None # is alias for arff path

0 commit comments

Comments
 (0)