Skip to content

Commit 4fb5ed8

Browse files
committed
update tests with mock requests
1 parent 0a8cde6 commit 4fb5ed8

2 files changed

Lines changed: 142 additions & 91 deletions

File tree

tests/conftest.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def workdir(tmp_path):
313313
def mock_iris_dataset(requests_mock, test_files_directory):
314314
"""Fixture to provide the iris dataset."""
315315
content_file = (
316-
test_files_directory / "mock_responses" / "datasets" / "61" / "description.xml"
316+
test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
317317
)
318318
requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
319319

@@ -323,8 +323,45 @@ def mock_iris_dataset(requests_mock, test_files_directory):
323323
def mock_titanic_dataset(requests_mock, test_files_directory):
324324
"""Fixture to provide the titanic dataset."""
325325
content_file = (
326-
test_files_directory / "mock_responses" / "datasets" / "40945" / "description.xml"
326+
test_files_directory / "mock_responses" / "datasets" / "data_description_40945.xml"
327327
)
328328
requests_mock.get("https://www.openml.org/api/v1/xml/data/40945", text=content_file.read_text())
329329

330330
yield
331+
332+
333+
@pytest.fixture
334+
def mock_dataset_id_2(requests_mock, test_files_directory):
335+
"""Fixture to mock dataset ID 2 (anneal): its description XML and its ARFF data download."""
336+
content_file = (
337+
test_files_directory / "mock_responses" / "datasets" / "2" / "description.xml"
338+
)
339+
requests_mock.get("https://www.openml.org/api/v1/xml/data/2", text=content_file.read_text())
340+
341+
data_file = (
342+
test_files_directory / "mock_responses" / "datasets" / "2" / "dataset.arff"
343+
)
344+
requests_mock.get("https://api.openml.org/data/v1/download/1666876/anneal.arff", text=data_file.read_text())
345+
346+
347+
yield
348+
349+
@pytest.fixture
350+
def mock_jm1_dataset(requests_mock, test_files_directory):
351+
"""Fixture to provide the JM1 dataset."""
352+
content_file = (
353+
test_files_directory / "mock_responses" / "datasets" / "data_description_1053.xml"
354+
)
355+
requests_mock.get("https://www.openml.org/api/v1/xml/data/1053", text=content_file.read_text())
356+
357+
yield
358+
359+
@pytest.fixture
360+
def mock_pc4_dataset(requests_mock, test_files_directory):
361+
"""Fixture to provide the PC4 dataset."""
362+
content_file = (
363+
test_files_directory / "mock_responses" / "datasets" / "data_description_1049.xml"
364+
)
365+
requests_mock.get("https://www.openml.org/api/v1/xml/data/1049", text=content_file.read_text())
366+
367+
yield

tests/test_datasets/test_dataset.py

Lines changed: 103 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -73,24 +73,6 @@ def test_repr(self):
7373
str(data)
7474

7575

76-
77-
78-
79-
80-
81-
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
82-
def test_get_data_boolean_pandas(self):
83-
# test to check that we are converting properly True and False even
84-
# with some inconsistency when dumping the data on openml
85-
data, _, _, _ = self.jm1.get_data()
86-
assert data["defects"].dtype.name == "category"
87-
assert set(data["defects"].cat.categories) == {True, False}
88-
89-
data, _, _, _ = self.pc4.get_data()
90-
assert data["c"].dtype.name == "category"
91-
assert set(data["c"].cat.categories) == {True, False}
92-
93-
9476
@pytest.mark.production
9577
def test_init_string_validation():
9678
with pytest.raises(ValueError, match="Invalid symbols ' ' in name"):
@@ -105,7 +87,7 @@ def test_init_string_validation():
10587
description="a description",
10688
citation="Something by Müller",
10789
)
108-
90+
10991
@pytest.mark.production
11092
def test__unpack_categories_with_nan_likes():
11193
# unpack_categories decodes numeric categorical values according to the header
@@ -117,11 +99,75 @@ def test__unpack_categories_with_nan_likes():
11799
expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"]
118100
assert list(clean_series.values) == expected_values
119101
assert list(clean_series.cat.categories.values) == list("ab")
102+
103+
# NOTE: this test calls get_data(), so it expects the dataset's data file to be downloaded/available.
104+
@pytest.mark.production
105+
def test_get_data_pandas(mock_titanic_dataset):
106+
titanic = openml.datasets.get_dataset(40945, download_data=False)
107+
data, _, _, _ = titanic.get_data()
108+
assert isinstance(data, pd.DataFrame)
109+
assert data.shape[1] == len(titanic.features)
110+
assert data.shape[0] == 1309
111+
col_dtype = {
112+
"pclass": "uint8",
113+
"survived": "category",
114+
"name": "object",
115+
"sex": "category",
116+
"age": "float64",
117+
"sibsp": "uint8",
118+
"parch": "uint8",
119+
"ticket": "object",
120+
"fare": "float64",
121+
"cabin": "object",
122+
"embarked": "category",
123+
"boat": "object",
124+
"body": "float64",
125+
"home.dest": "object",
126+
}
127+
for col_name in data.columns:
128+
assert data[col_name].dtype.name == col_dtype[col_name]
129+
130+
X, y, _, _ = titanic.get_data(
131+
target=titanic.default_target_attribute,
132+
)
133+
assert isinstance(X, pd.DataFrame)
134+
assert isinstance(y, pd.Series)
135+
assert X.shape == (1309, 13)
136+
assert y.shape == (1309,)
137+
for col_name in X.columns:
138+
assert X[col_name].dtype.name == col_dtype[col_name]
139+
assert y.dtype.name == col_dtype["survived"]
140+
141+
# TODO: why is download_data=False passed when the test then calls get_data() and expects the data?
142+
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
143+
@pytest.mark.production
144+
def test_get_data_boolean_pandas(mock_jm1_dataset, mock_pc4_dataset):
145+
# test to check that we are converting properly True and False even
146+
# with some inconsistency when dumping the data on openml
147+
jm1 = openml.datasets.get_dataset(1053, download_data=False)
148+
pc4 = openml.datasets.get_dataset(1049, download_data=False)
120149

150+
data, _, _, _ = jm1.get_data()
151+
assert data["defects"].dtype.name == "category"
152+
assert set(data["defects"].cat.categories) == {True, False}
153+
154+
data, _, _, _ = pc4.get_data()
155+
assert data["c"].dtype.name == "category"
156+
assert set(data["c"].cat.categories) == {True, False}
157+
158+
def _check_expected_type(dtype, is_cat, col):
159+
if is_cat:
160+
expected_type = "category"
161+
elif not col.isna().any() and (col.astype("uint8") == col).all():
162+
expected_type = "uint8"
163+
else:
164+
expected_type = "float64"
165+
166+
assert dtype.name == expected_type
121167

122168
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
123169
@pytest.mark.production
124-
def test_get_data_with_rowid():
170+
def test_get_data_with_rowid(mock_dataset_id_2):
125171
dataset = openml.datasets.get_dataset(2, download_data=False)
126172
dataset.row_id_attribute = "condition"
127173
rval, _, categorical, _ = dataset.get_data(include_row_id=True)
@@ -138,9 +184,10 @@ def test_get_data_with_rowid():
138184
assert rval.shape == (898, 38)
139185
assert len(categorical) == 38
140186

187+
# TODO: verify - this test hits the same error involving _check_expected_type.
141188
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
142189
@pytest.mark.production
143-
def test_get_data_with_target_pandas():
190+
def test_get_data_with_target_pandas(mock_dataset_id_2):
144191
dataset = openml.datasets.get_dataset(2, download_data=False)
145192
X, y, categorical, attribute_names = dataset.get_data(target="class")
146193
assert isinstance(X, pd.DataFrame)
@@ -155,16 +202,20 @@ def test_get_data_with_target_pandas():
155202

156203
assert "class" not in attribute_names
157204

158-
def _check_expected_type(dtype, is_cat, col):
159-
if is_cat:
160-
expected_type = "category"
161-
elif not col.isna().any() and (col.astype("uint8") == col).all():
162-
expected_type = "uint8"
163-
else:
164-
expected_type = "float64"
165205

166-
assert dtype.name == expected_type
206+
@pytest.mark.production
207+
def test_get_data_rowid_and_ignore_and_target(mock_dataset_id_2):
208+
dataset = openml.datasets.get_dataset(2, download_data=False)
209+
dataset.ignore_attribute = ["condition"]
210+
dataset.row_id_attribute = ["hardness"]
211+
X, y, categorical, names = dataset.get_data(target="class")
212+
assert X.shape == (898, 36)
213+
assert len(categorical) == 36
214+
cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3
215+
assert categorical == cats
216+
assert y.shape == (898,)
167217

218+
# TODO: verify - this test fails with a _check_expected_type error.
168219
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
169220
@pytest.mark.production
170221
def test_get_data_with_ignore_attributes():
@@ -181,69 +232,20 @@ def test_get_data_with_ignore_attributes():
181232
_check_expected_type(dtype, is_cat, rval[col])
182233
assert rval.shape == (898, 38)
183234
assert len(categorical) == 38
184-
185-
186-
@pytest.mark.production
187-
def test_get_data_pandas():
188-
titanic = openml.datasets.get_dataset(40945, download_data=False)
189-
data, _, _, _ = titanic.get_data()
190-
assert isinstance(data, pd.DataFrame)
191-
assert data.shape[1] == len(titanic.features)
192-
assert data.shape[0] == 1309
193-
col_dtype = {
194-
"pclass": "uint8",
195-
"survived": "category",
196-
"name": "object",
197-
"sex": "category",
198-
"age": "float64",
199-
"sibsp": "uint8",
200-
"parch": "uint8",
201-
"ticket": "object",
202-
"fare": "float64",
203-
"cabin": "object",
204-
"embarked": "category",
205-
"boat": "object",
206-
"body": "float64",
207-
"home.dest": "object",
208-
}
209-
for col_name in data.columns:
210-
assert data[col_name].dtype.name == col_dtype[col_name]
211-
212-
X, y, _, _ = titanic.get_data(
213-
target=titanic.default_target_attribute,
214-
)
215-
assert isinstance(X, pd.DataFrame)
216-
assert isinstance(y, pd.Series)
217-
assert X.shape == (1309, 13)
218-
assert y.shape == (1309,)
219-
for col_name in X.columns:
220-
assert X[col_name].dtype.name == col_dtype[col_name]
221-
assert y.dtype.name == col_dtype["survived"]
222-
223-
@pytest.mark.production
224-
def test_get_data_rowid_and_ignore_and_target():
225-
dataset = openml.datasets.get_dataset(2, download_data=False)
226-
dataset.ignore_attribute = ["condition"]
227-
dataset.row_id_attribute = ["hardness"]
228-
X, y, categorical, names = dataset.get_data(target="class")
229-
assert X.shape == (898, 36)
230-
assert len(categorical) == 36
231-
cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3
232-
assert categorical == cats
233-
assert y.shape == (898,)
234-
235+
236+
235237
@pytest.mark.production
236-
def test_get_data_with_nonexisting_class():
238+
def test_get_data_with_nonexisting_class(mock_dataset_id_2):
237239
# This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
238240
# label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
239241
# indices 4 and 5, and that nothing is mapped to index 3.
240242
dataset = openml.datasets.get_dataset(2, download_data=False)
241243
_, y, _, _ = dataset.get_data("class")
242244
assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"]
243245

244-
246+
245247
@pytest.mark.production
246-
def test_get_data_corrupt_pickle():
248+
def test_get_data_corrupt_pickle(mock_iris_dataset):
247249
# Lazy loaded dataset, populate cache.
248250
iris = openml.datasets.get_dataset(61, download_data=False)
249251
iris.get_data()
@@ -255,7 +257,17 @@ def test_get_data_corrupt_pickle():
255257
xy, _, _, _ = iris.get_data()
256258
assert isinstance(xy, pd.DataFrame)
257259
assert xy.shape == (150, 5)
258-
260+
iris.get_data()
261+
# Corrupt pickle file, overwrite as empty.
262+
with open(iris.data_pickle_file, "w") as fh:
263+
fh.write("")
264+
# Despite the corrupt file, the data should be loaded from the ARFF file.
265+
# A warning message is written to the python logger.
266+
xy, _, _, _ = iris.get_data()
267+
assert isinstance(xy, pd.DataFrame)
268+
assert xy.shape == (150, 5)
269+
270+
# TODO: re-check the test below.
259271
@pytest.mark.production
260272
def test_lazy_loading_metadata():
261273
# Initial Setup
@@ -307,8 +319,11 @@ def test_equality_comparison(mock_iris_dataset, mock_titanic_dataset):
307319
assert iris == iris
308320
assert iris != titanic
309321
assert titanic != "Wrong_object"
310-
311-
322+
323+
324+
325+
326+
312327
def test_tagging():
313328
dataset = openml.datasets.get_dataset(125, download_data=False)
314329

@@ -358,7 +373,6 @@ def test_add_illegal_long_ontology():
358373
assert e.code == 1105
359374

360375

361-
362376
def test_add_illegal_url_ontology():
363377
did = 1
364378
ontology = "not_a_url" + str(time())
@@ -367,8 +381,8 @@ def test_add_illegal_url_ontology():
367381
assert False
368382
except openml.exceptions.OpenMLServerException as e:
369383
assert e.code == 1106
370-
371-
384+
385+
372386
@pytest.mark.production()
373387
class OpenMLDatasetTestSparse(TestBase):
374388
_multiprocess_can_split_ = True

0 commit comments

Comments
 (0)