Update dataset tests to use mocked HTTP requests

Taniya-Das · Taniya-Das · commit 4fb5ed8ef792 · 2025-06-20T17:56:42.000+02:00
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -313,7 +313,7 @@ def workdir(tmp_path):
 def mock_iris_dataset(requests_mock, test_files_directory):
     """Fixture to provide the iris dataset."""
     content_file = (
-        test_files_directory / "mock_responses" / "datasets" / "61" / "description.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
     )
     requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
     
@@ -323,8 +323,45 @@ def mock_iris_dataset(requests_mock, test_files_directory):
 def mock_titanic_dataset(requests_mock, test_files_directory):
     """Fixture to provide the titanic dataset."""
     content_file = (
-        test_files_directory / "mock_responses" / "datasets" / "40945" / "description.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_description_40945.xml"
     )
     requests_mock.get("https://www.openml.org/api/v1/xml/data/40945", text=content_file.read_text())
     
     yield
+
+
+@pytest.fixture
+def mock_dataset_id_2(requests_mock, test_files_directory):
+    """Fixture to provide the dataset ID 2."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "2" / "description.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/2", text=content_file.read_text())
+    
+    data_file = (
+        test_files_directory / "mock_responses" / "datasets" / "2" / "dataset.arff"
+    )
+    requests_mock.get("https://api.openml.org/data/v1/download/1666876/anneal.arff", text=data_file.read_text())
+    
+    
+    yield
+    
+@pytest.fixture
+def mock_jm1_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the JM1 dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_1053.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/1053", text=content_file.read_text())
+    
+    yield
+    
+@pytest.fixture
+def mock_pc4_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the PC4 dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_1049.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/1049", text=content_file.read_text())
+    
+    yield
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -73,24 +73,6 @@ def test_repr(self):
         str(data)
 
 
-
-
-
-
-
-    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
-    def test_get_data_boolean_pandas(self):
-        # test to check that we are converting properly True and False even
-        # with some inconsistency when dumping the data on openml
-        data, _, _, _ = self.jm1.get_data()
-        assert data["defects"].dtype.name == "category"
-        assert set(data["defects"].cat.categories) == {True, False}
-
-        data, _, _, _ = self.pc4.get_data()
-        assert data["c"].dtype.name == "category"
-        assert set(data["c"].cat.categories) == {True, False}
-        
-        
 @pytest.mark.production        
 def test_init_string_validation():
     with pytest.raises(ValueError, match="Invalid symbols ' ' in name"):
@@ -105,7 +87,7 @@ def test_init_string_validation():
             description="a description",
             citation="Something by Müller",
         )
-        
+
 @pytest.mark.production        
 def test__unpack_categories_with_nan_likes():
     # unpack_categories decodes numeric categorical values according to the header
@@ -117,11 +99,75 @@ def test__unpack_categories_with_nan_likes():
     expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"]
     assert list(clean_series.values) == expected_values
     assert list(clean_series.cat.categories.values) == list("ab")
+
+# NOTE: this test expects the dataset's data to be downloaded (get_data() is called below).
+@pytest.mark.production 
+def test_get_data_pandas(mock_titanic_dataset):
+    titanic = openml.datasets.get_dataset(40945, download_data=False)
+    data, _, _, _ = titanic.get_data()
+    assert isinstance(data, pd.DataFrame)
+    assert data.shape[1] == len(titanic.features)
+    assert data.shape[0] == 1309
+    col_dtype = {
+        "pclass": "uint8",
+        "survived": "category",
+        "name": "object",
+        "sex": "category",
+        "age": "float64",
+        "sibsp": "uint8",
+        "parch": "uint8",
+        "ticket": "object",
+        "fare": "float64",
+        "cabin": "object",
+        "embarked": "category",
+        "boat": "object",
+        "body": "float64",
+        "home.dest": "object",
+    }
+    for col_name in data.columns:
+        assert data[col_name].dtype.name == col_dtype[col_name]
+    
+    X, y, _, _ = titanic.get_data(
+        target=titanic.default_target_attribute,
+    )
+    assert isinstance(X, pd.DataFrame)
+    assert isinstance(y, pd.Series)
+    assert X.shape == (1309, 13)
+    assert y.shape == (1309,)
+    for col_name in X.columns:
+        assert X[col_name].dtype.name == col_dtype[col_name]
+    assert y.dtype.name == col_dtype["survived"]
+
+# TODO: why pass download_data=False and then expect data from get_data()?
+@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
+@pytest.mark.production
+def test_get_data_boolean_pandas(mock_jm1_dataset, mock_pc4_dataset):
+    # test to check that we are converting properly True and False even
+    # with some inconsistency when dumping the data on openml
+    jm1 = openml.datasets.get_dataset(1053, download_data=False)
+    pc4 = openml.datasets.get_dataset(1049, download_data=False)
     
+    data, _, _, _ = jm1.get_data()
+    assert data["defects"].dtype.name == "category"
+    assert set(data["defects"].cat.categories) == {True, False}
+
+    data, _, _, _ = pc4.get_data()
+    assert data["c"].dtype.name == "category"
+    assert set(data["c"].cat.categories) == {True, False}
+
+def _check_expected_type(dtype, is_cat, col):
+    if is_cat:
+        expected_type = "category"
+    elif not col.isna().any() and (col.astype("uint8") == col).all():
+        expected_type = "uint8"
+    else:
+        expected_type = "float64"
+
+    assert dtype.name == expected_type
 
 @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
 @pytest.mark.production
-def test_get_data_with_rowid():
+def test_get_data_with_rowid(mock_dataset_id_2):
     dataset = openml.datasets.get_dataset(2, download_data=False)
     dataset.row_id_attribute = "condition"
     rval, _, categorical, _ = dataset.get_data(include_row_id=True)
@@ -138,9 +184,10 @@ def test_get_data_with_rowid():
     assert rval.shape == (898, 38)
     assert len(categorical) == 38
 
+# TODO: same error with _check_expected_type; verify.
 @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
 @pytest.mark.production
-def test_get_data_with_target_pandas():
+def test_get_data_with_target_pandas(mock_dataset_id_2):
     dataset = openml.datasets.get_dataset(2, download_data=False)
     X, y, categorical, attribute_names = dataset.get_data(target="class")
     assert isinstance(X, pd.DataFrame)
@@ -155,16 +202,20 @@ def test_get_data_with_target_pandas():
 
     assert "class" not in attribute_names
 
-def _check_expected_type(dtype, is_cat, col):
-    if is_cat:
-        expected_type = "category"
-    elif not col.isna().any() and (col.astype("uint8") == col).all():
-        expected_type = "uint8"
-    else:
-        expected_type = "float64"
 
-    assert dtype.name == expected_type
+@pytest.mark.production        
+def test_get_data_rowid_and_ignore_and_target(mock_dataset_id_2):
+    dataset = openml.datasets.get_dataset(2, download_data=False)
+    dataset.ignore_attribute = ["condition"]
+    dataset.row_id_attribute = ["hardness"]
+    X, y, categorical, names = dataset.get_data(target="class")
+    assert X.shape == (898, 36)
+    assert len(categorical) == 36
+    cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3
+    assert categorical == cats
+    assert y.shape == (898,) 
 
+# TODO: _check_expected_type error; verify.
 @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
 @pytest.mark.production
 def test_get_data_with_ignore_attributes():
@@ -181,69 +232,20 @@ def test_get_data_with_ignore_attributes():
         _check_expected_type(dtype, is_cat, rval[col])
     assert rval.shape == (898, 38)
     assert len(categorical) == 38
-        
-        
-@pytest.mark.production 
-def test_get_data_pandas():
-    titanic = openml.datasets.get_dataset(40945, download_data=False)
-    data, _, _, _ = titanic.get_data()
-    assert isinstance(data, pd.DataFrame)
-    assert data.shape[1] == len(titanic.features)
-    assert data.shape[0] == 1309
-    col_dtype = {
-        "pclass": "uint8",
-        "survived": "category",
-        "name": "object",
-        "sex": "category",
-        "age": "float64",
-        "sibsp": "uint8",
-        "parch": "uint8",
-        "ticket": "object",
-        "fare": "float64",
-        "cabin": "object",
-        "embarked": "category",
-        "boat": "object",
-        "body": "float64",
-        "home.dest": "object",
-    }
-    for col_name in data.columns:
-        assert data[col_name].dtype.name == col_dtype[col_name]
-    
-    X, y, _, _ = titanic.get_data(
-        target=titanic.default_target_attribute,
-    )
-    assert isinstance(X, pd.DataFrame)
-    assert isinstance(y, pd.Series)
-    assert X.shape == (1309, 13)
-    assert y.shape == (1309,)
-    for col_name in X.columns:
-        assert X[col_name].dtype.name == col_dtype[col_name]
-    assert y.dtype.name == col_dtype["survived"]
-               
-@pytest.mark.production        
-def test_get_data_rowid_and_ignore_and_target():
-    dataset = openml.datasets.get_dataset(2, download_data=False)
-    dataset.ignore_attribute = ["condition"]
-    dataset.row_id_attribute = ["hardness"]
-    X, y, categorical, names = dataset.get_data(target="class")
-    assert X.shape == (898, 36)
-    assert len(categorical) == 36
-    cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3
-    assert categorical == cats
-    assert y.shape == (898,)        
-    
+
+
 @pytest.mark.production
-def test_get_data_with_nonexisting_class():
+def test_get_data_with_nonexisting_class(mock_dataset_id_2):
     # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
     # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
     # indices 4 and 5, and that nothing is mapped to index 3.
     dataset = openml.datasets.get_dataset(2, download_data=False)
     _, y, _, _ = dataset.get_data("class")
     assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"]
     
-    
+
 @pytest.mark.production
-def test_get_data_corrupt_pickle():
+def test_get_data_corrupt_pickle(mock_iris_dataset):
     # Lazy loaded dataset, populate cache.
     iris = openml.datasets.get_dataset(61, download_data=False)
     iris.get_data()
@@ -255,7 +257,17 @@ def test_get_data_corrupt_pickle():
     xy, _, _, _ = iris.get_data()
     assert isinstance(xy, pd.DataFrame)
     assert xy.shape == (150, 5)
-    
+    iris.get_data()
+    # Corrupt pickle file, overwrite as empty.
+    with open(iris.data_pickle_file, "w") as fh:
+        fh.write("")
+    # Despite the corrupt file, the data should be loaded from the ARFF file.
+    # A warning message is written to the python logger.
+    xy, _, _, _ = iris.get_data()
+    assert isinstance(xy, pd.DataFrame)
+    assert xy.shape == (150, 5)
+
+# TODO: check this test again.
 @pytest.mark.production 
 def test_lazy_loading_metadata():
     # Initial Setup
@@ -307,8 +319,11 @@ def test_equality_comparison(mock_iris_dataset, mock_titanic_dataset):
     assert iris == iris
     assert iris != titanic
     assert titanic != "Wrong_object"
-
-
+ 
+ 
+ 
+ 
+ 
 def test_tagging():
     dataset = openml.datasets.get_dataset(125, download_data=False)
 
@@ -358,7 +373,6 @@ def test_add_illegal_long_ontology():
         assert e.code == 1105
     
 
-
 def test_add_illegal_url_ontology():
     did = 1
     ontology = "not_a_url" + str(time())
@@ -367,8 +381,8 @@ def test_add_illegal_url_ontology():
         assert False
     except openml.exceptions.OpenMLServerException as e:
         assert e.code == 1106
-
-
+           
+    
 @pytest.mark.production()
 class OpenMLDatasetTestSparse(TestBase):
     _multiprocess_can_split_ = True