Merge branch 'develop' into maint/pytest_test_dataset_test__read_qualities

Taniya-Das · web-flow · commit a08dff2bd90a · 2025-06-18T11:03:58.000+02:00
diff --git a/.all-contributorsrc b/.all-contributorsrc
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
@@ -1,3 +1,15 @@
+<!--
+It is recommended to check that your issue complies with the
+following rules before submitting:
+
+-  Verify that your issue is not currently being addressed by other
+   issues (https://github.com/openml/openml-python/issues)
+   or pull requests (https://github.com/openml/openml-python/pulls).
+
+-  Please ensure all code snippets and error messages are formatted in
+   appropriate code blocks. See https://help.github.com/articles/creating-and-highlighting-code-blocks
+-->
+
 #### Description
 <!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->
 
@@ -20,7 +32,10 @@ it in the issue: https://gist.github.com
 
 #### Versions
 <!--
-Please run the following snippet and paste the output below.
+Please include your operating system type and version number, as well
+as your Python, openml, scikit-learn, numpy, and scipy versions. This information
+can be found by running the following code snippet:
+
 import platform; print(platform.platform())
 import sys; print("Python", sys.version)
 import numpy; print("NumPy", numpy.__version__)
@@ -30,4 +45,5 @@ import openml; print("OpenML", openml.__version__)
 -->
 
 
-<!-- Thanks for contributing! -->
+<!-- Thanks for contributing! -->
+
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md
@@ -4,8 +4,8 @@ the contribution guidelines: https://github.com/openml/openml-python/blob/main/C
 
 Please make sure that:
 
+* the title of the pull request is descriptive
 * this pull requests is against the `develop` branch
-* you updated all docs, this includes the changelog (doc/progress.rst)
 * for any new function or class added, please add it to doc/api.rst
     * the list of classes and functions should be alphabetical 
 * for any new functionality, consider adding a relevant example
@@ -14,15 +14,20 @@ Please make sure that:
 * add the BSD 3-Clause license to any new file created
 -->
 
-#### Reference Issue
-<!-- Example: Fixes #1234 -->
+#### Metadata
+* Reference Issue: <!-- Example: Fixes #1234 or NA-->
+* New Tests Added: <!-- Yes/No/NA -->
+* Documentation Updated: <!-- Yes/No/NA -->
+* Change Log Entry: <!-- Short String, example: "Add new function `foo()` to module `bar`"; or "Fixes a bug with `bar`" -->
 
 
-#### What does this PR implement/fix? Explain your changes.
-
-
-#### How should this PR be tested?
-
+#### Details 
+<!--
+If necessary, please share the following:
 
-#### Any other comments?
+* What does this PR implement/fix? Explain your changes.
+* Why is this change necessary? What is the problem it solves?
+* How can the issue this PR solves be reproduced, and how can its fix be verified?
+* Any other comments?
+-->
 
diff --git a/doc/contributing.rst b/doc/contributing.rst
@@ -16,10 +16,7 @@ In particular, a few ways to contribute to openml-python are:
  * A contribution to an openml-python extension. An extension package allows OpenML to interface
    with a machine learning package (such as scikit-learn or keras). These extensions
    are hosted in separate repositories and may have their own guidelines.
-   For more information, see the :ref:`extensions` below.
-
- * Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let
-   us know about the problem. See `this section <https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md>`_.
+   For more information, see the :ref:`extensions`.
 
  * `Cite OpenML <https://www.openml.org/cite>`_ if you use it in a scientific publication.
 
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -492,6 +492,7 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
         "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
         "evaluation_measure": evaluation_measures,
     }
+    # TODO: add OpenMLClusteringTask?
     if task_type in (
         TaskType.SUPERVISED_CLASSIFICATION,
         TaskType.SUPERVISED_REGRESSION,
@@ -508,6 +509,10 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
         common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
             "oml:estimation_procedure"
         ]["oml:type"]
+        common_kwargs["estimation_procedure_id"] = int(
+            inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+        )
+
         common_kwargs["estimation_parameters"] = estimation_parameters
         common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
         common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,8 +23,12 @@ dependencies = [
   "packaging",
 ]
 requires-python = ">=3.8"
+maintainers = [
+  { name = "Pieter Gijsbers", email="p.gijsbers@tue.nl"},
+  { name = "Lennart Purucker"},
+]
 authors = [
-  { name = "Matthias Feurer", email="feurerm@informatik.uni-freiburg.de" },
+  { name = "Matthias Feurer"},
   { name = "Jan van Rijn" },
   { name = "Arlind Kadra" },
   { name = "Pieter Gijsbers" },
@@ -52,6 +56,7 @@ classifiers = [
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
 ]
 license = { file = "LICENSE" }
 
@@ -74,6 +79,7 @@ test=[
     "pytest-rerunfailures",
     "mypy",
     "ruff",
+    "requests-mock",
 ]
 examples=[
     "matplotlib",
diff --git a/tests/files/mock_responses/datasets/data_description_61.xml b/tests/files/mock_responses/datasets/data_description_61.xml
@@ -0,0 +1,30 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>61</oml:id>
+  <oml:name>iris</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: R.A. Fisher  
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
+**Please cite**:   
+
+**Iris Plants Database**  
+This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda &amp; Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.  
+This is an exceedingly simple domain.  
+ 
+### Attribute Information:
+    1. sepal length in cm
+    2. sepal width in cm
+    3. petal length in cm
+    4. petal width in cm
+    5. class: 
+       -- Iris Setosa
+       -- Iris Versicolour
+       -- Iris Virginica</oml:description>
+  <oml:description_version>4</oml:description_version>
+  <oml:format>ARFF</oml:format>
+  <oml:creator>R.A. Fisher</oml:creator>     <oml:collection_date>1936</oml:collection_date>  <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
+  <oml:language>English</oml:language>  <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
+  <oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url>  <oml:file_id>61</oml:file_id>  <oml:default_target_attribute>class</oml:default_target_attribute>      <oml:version_label>1</oml:version_label>  <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation>  <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag>  <oml:visibility>public</oml:visibility>  <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url>  <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url>  <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2020-11-20 19:02:18</oml:processing_date>      <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -402,20 +402,6 @@ def test_get_sparse_categorical_data_id_395(self):
 class OpenMLDatasetFunctionTest(TestBase):
 
 
-    def test__check_qualities(self):
-        qualities = [{"oml:name": "a", "oml:value": "0.5"}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        assert qualities["a"] == 0.5
-
-        qualities = [{"oml:name": "a", "oml:value": "null"}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        assert qualities["a"] != qualities["a"]
-
-        qualities = [{"oml:name": "a", "oml:value": None}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        assert qualities["a"] != qualities["a"]
-
-
 
 def test__read_features(mocker, workdir, static_cache_dir):
     """Test we read the features from the xml if no cache pickle is available.
@@ -478,3 +464,16 @@ def test__read_qualities(static_cache_dir, workdir, mocker):
     assert pickle_mock.dump.call_count == 1
 
 
+def test__check_qualities():
+    qualities = [{"oml:name": "a", "oml:value": "0.5"}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert qualities["a"] == 0.5
+
+    qualities = [{"oml:name": "a", "oml:value": "null"}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert qualities["a"] != qualities["a"]
+
+    qualities = [{"oml:name": "a", "oml:value": None}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert qualities["a"] != qualities["a"]
+
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -17,6 +17,7 @@
 import pandas as pd
 import pytest
 import requests
+import requests_mock
 import scipy.sparse
 from oslo_concurrency import lockutils
 
@@ -387,14 +388,6 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
             file_destination
         ), "_download_minio_file can download from subdirectories"
 
-    def test__get_dataset_parquet_not_cached(self):
-        description = {
-            "oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
-            "oml:id": "20",
-        }
-        path = _get_dataset_parquet(description, cache_directory=self.workdir)
-        assert isinstance(path, Path), "_get_dataset_parquet returns a path"
-        assert path.is_file(), "_get_dataset_parquet returns path to real file"
 
     @mock.patch("openml._api_calls._download_minio_file")
     def test__get_dataset_parquet_is_cached(self, patch):
@@ -1504,16 +1497,6 @@ def test_data_fork(self):
             data_id=999999,
         )
 
-    @pytest.mark.production()
-    def test_get_dataset_parquet(self):
-        # Parquet functionality is disabled on the test server
-        # There is no parquet-copy of the test server yet.
-        openml.config.server = self.production_server
-        dataset = openml.datasets.get_dataset(61, download_data=True)
-        assert dataset._parquet_url is not None
-        assert dataset.parquet_file is not None
-        assert os.path.isfile(dataset.parquet_file)
-        assert dataset.data_file is None  # is alias for arff path
 
     @pytest.mark.production()
     def test_list_datasets_with_high_size_parameter(self):
@@ -1942,6 +1925,16 @@ def test_get_dataset_with_invalid_id() -> None:
         assert e.value.code == 111
 
 
+def test__get_dataset_parquet_not_cached():
+    description = {
+        "oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
+        "oml:id": "20",
+    }
+    path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory()))
+    assert isinstance(path, Path), "_get_dataset_parquet returns a path"
+    assert path.is_file(), "_get_dataset_parquet returns path to real file"
+
+
 def test_read_features_from_xml_with_whitespace() -> None:
     from openml.datasets.dataset import _read_features
 
@@ -1950,3 +1943,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
     )
     dict = _read_features(features_file)
     assert dict[1].nominal_values == [" - 50000.", " 50000+."]
+
+
+def test_get_dataset_parquet(requests_mock, test_files_directory):
+    # Parquet functionality is disabled on the test server
+    # There is no parquet-copy of the test server yet.
+    content_file = (
+            test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
+    dataset = openml.datasets.get_dataset(61, download_data=True)
+    assert dataset._parquet_url is not None
+    assert dataset.parquet_file is not None
+    assert os.path.isfile(dataset.parquet_file)
+    assert dataset.data_file is None  # is alias for arff path
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
@@ -15,7 +15,7 @@ def setUp(self, n_levels: int = 1):
         super().setUp()
         self.task_id = 119  # diabetes
         self.task_type = TaskType.SUPERVISED_CLASSIFICATION
-        self.estimation_procedure = 1
+        self.estimation_procedure = 5
 
     def test_get_X_and_Y(self):
         X, Y = super().test_get_X_and_Y()
@@ -30,7 +30,8 @@ def test_download_task(self):
         assert task.task_id == self.task_id
         assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION
         assert task.dataset_id == 20
+        assert task.estimation_procedure_id == self.estimation_procedure
 
     def test_class_labels(self):
         task = get_task(self.task_id)
-        assert task.class_labels == ["tested_negative", "tested_positive"]
+        assert task.class_labels == ["tested_negative", "tested_positive"]
diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py
@@ -18,11 +18,11 @@ class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest):
 
     def setUp(self, n_levels: int = 1):
         super().setUp()
-
+        self.estimation_procedure = 9
         task_meta_data = {
             "task_type": TaskType.SUPERVISED_REGRESSION,
             "dataset_id": 105,  # wisconsin
-            "estimation_procedure_id": 7,
+            "estimation_procedure_id": self.estimation_procedure, # non default value to test estimation procedure id
             "target_name": "time",
         }
         _task_id = check_task_existence(**task_meta_data)
@@ -46,7 +46,7 @@ def setUp(self, n_levels: int = 1):
                     raise Exception(repr(e))
         self.task_id = task_id
         self.task_type = TaskType.SUPERVISED_REGRESSION
-        self.estimation_procedure = 7
+
 
     def test_get_X_and_Y(self):
         X, Y = super().test_get_X_and_Y()
@@ -61,3 +61,4 @@ def test_download_task(self):
         assert task.task_id == self.task_id
         assert task.task_type_id == TaskType.SUPERVISED_REGRESSION
         assert task.dataset_id == 105
+        assert task.estimation_procedure_id == self.estimation_procedure