@@ -73,24 +73,6 @@ def test_repr(self):
7373 str (data )
7474
7575
76-
77-
78-
79-
80-
81- @pytest .mark .skip ("https://github.com/openml/openml-python/issues/1157" )
82- def test_get_data_boolean_pandas (self ):
83- # test to check that we are converting properly True and False even
84- # with some inconsistency when dumping the data on openml
85- data , _ , _ , _ = self .jm1 .get_data ()
86- assert data ["defects" ].dtype .name == "category"
87- assert set (data ["defects" ].cat .categories ) == {True , False }
88-
89- data , _ , _ , _ = self .pc4 .get_data ()
90- assert data ["c" ].dtype .name == "category"
91- assert set (data ["c" ].cat .categories ) == {True , False }
92-
93-
9476@pytest .mark .production
9577def test_init_string_validation ():
9678 with pytest .raises (ValueError , match = "Invalid symbols ' ' in name" ):
@@ -105,7 +87,7 @@ def test_init_string_validation():
10587 description = "a description" ,
10688 citation = "Something by Müller" ,
10789 )
108-
90+
10991@pytest .mark .production
11092def test__unpack_categories_with_nan_likes ():
11193 # unpack_categories decodes numeric categorical values according to the header
@@ -117,11 +99,75 @@ def test__unpack_categories_with_nan_likes():
11799 expected_values = ["a" , "b" , np .nan , np .nan , np .nan , "b" , "a" ]
118100 assert list (clean_series .values ) == expected_values
119101 assert list (clean_series .cat .categories .values ) == list ("ab" )
102+
103+ # expects downloaded data.
104+ @pytest .mark .production
105+ def test_get_data_pandas (mock_titanic_dataset ):
106+ titanic = openml .datasets .get_dataset (40945 , download_data = False )
107+ data , _ , _ , _ = titanic .get_data ()
108+ assert isinstance (data , pd .DataFrame )
109+ assert data .shape [1 ] == len (titanic .features )
110+ assert data .shape [0 ] == 1309
111+ col_dtype = {
112+ "pclass" : "uint8" ,
113+ "survived" : "category" ,
114+ "name" : "object" ,
115+ "sex" : "category" ,
116+ "age" : "float64" ,
117+ "sibsp" : "uint8" ,
118+ "parch" : "uint8" ,
119+ "ticket" : "object" ,
120+ "fare" : "float64" ,
121+ "cabin" : "object" ,
122+ "embarked" : "category" ,
123+ "boat" : "object" ,
124+ "body" : "float64" ,
125+ "home.dest" : "object" ,
126+ }
127+ for col_name in data .columns :
128+ assert data [col_name ].dtype .name == col_dtype [col_name ]
129+
130+ X , y , _ , _ = titanic .get_data (
131+ target = titanic .default_target_attribute ,
132+ )
133+ assert isinstance (X , pd .DataFrame )
134+ assert isinstance (y , pd .Series )
135+ assert X .shape == (1309 , 13 )
136+ assert y .shape == (1309 ,)
137+ for col_name in X .columns :
138+ assert X [col_name ].dtype .name == col_dtype [col_name ]
139+ assert y .dtype .name == col_dtype ["survived" ]
140+
141+ # Why download = False and then expecting data?
142+ @pytest .mark .skip ("https://github.com/openml/openml-python/issues/1157" )
143+ @pytest .mark .production
144+ def test_get_data_boolean_pandas (mock_jm1_dataset , mock_pc4_dataset ):
145+ # test to check that we are converting properly True and False even
146+ # with some inconsistency when dumping the data on openml
147+ jm1 = openml .datasets .get_dataset (1053 , download_data = False )
148+ pc4 = openml .datasets .get_dataset (1049 , download_data = False )
120149
150+ data , _ , _ , _ = jm1 .get_data ()
151+ assert data ["defects" ].dtype .name == "category"
152+ assert set (data ["defects" ].cat .categories ) == {True , False }
153+
154+ data , _ , _ , _ = pc4 .get_data ()
155+ assert data ["c" ].dtype .name == "category"
156+ assert set (data ["c" ].cat .categories ) == {True , False }
157+
158+ def _check_expected_type (dtype , is_cat , col ):
159+ if is_cat :
160+ expected_type = "category"
161+ elif not col .isna ().any () and (col .astype ("uint8" ) == col ).all ():
162+ expected_type = "uint8"
163+ else :
164+ expected_type = "float64"
165+
166+ assert dtype .name == expected_type
121167
122168@pytest .mark .skip ("https://github.com/openml/openml-python/issues/1157" )
123169@pytest .mark .production
124- def test_get_data_with_rowid ():
170+ def test_get_data_with_rowid (mock_dataset_id_2 ):
125171 dataset = openml .datasets .get_dataset (2 , download_data = False )
126172 dataset .row_id_attribute = "condition"
127173 rval , _ , categorical , _ = dataset .get_data (include_row_id = True )
@@ -138,9 +184,10 @@ def test_get_data_with_rowid():
138184 assert rval .shape == (898 , 38 )
139185 assert len (categorical ) == 38
140186
187+ # same error with check_expected_type. Verify.
141188@pytest .mark .skip ("https://github.com/openml/openml-python/issues/1157" )
142189@pytest .mark .production
143- def test_get_data_with_target_pandas ():
190+ def test_get_data_with_target_pandas (mock_dataset_id_2 ):
144191 dataset = openml .datasets .get_dataset (2 , download_data = False )
145192 X , y , categorical , attribute_names = dataset .get_data (target = "class" )
146193 assert isinstance (X , pd .DataFrame )
@@ -155,16 +202,20 @@ def test_get_data_with_target_pandas():
155202
156203 assert "class" not in attribute_names
157204
158- def _check_expected_type (dtype , is_cat , col ):
159- if is_cat :
160- expected_type = "category"
161- elif not col .isna ().any () and (col .astype ("uint8" ) == col ).all ():
162- expected_type = "uint8"
163- else :
164- expected_type = "float64"
165205
166- assert dtype .name == expected_type
206+ @pytest .mark .production
207+ def test_get_data_rowid_and_ignore_and_target (mock_dataset_id_2 ):
208+ dataset = openml .datasets .get_dataset (2 , download_data = False )
209+ dataset .ignore_attribute = ["condition" ]
210+ dataset .row_id_attribute = ["hardness" ]
211+ X , y , categorical , names = dataset .get_data (target = "class" )
212+ assert X .shape == (898 , 36 )
213+ assert len (categorical ) == 36
214+ cats = [True ] * 3 + [False , True , True , False ] + [True ] * 23 + [False ] * 3 + [True ] * 3
215+ assert categorical == cats
216+ assert y .shape == (898 ,)
167217
218+ # _check_expected_type error. Verify
168219@pytest .mark .skip ("https://github.com/openml/openml-python/issues/1157" )
169220@pytest .mark .production
170221def test_get_data_with_ignore_attributes ():
@@ -181,69 +232,20 @@ def test_get_data_with_ignore_attributes():
181232 _check_expected_type (dtype , is_cat , rval [col ])
182233 assert rval .shape == (898 , 38 )
183234 assert len (categorical ) == 38
184-
185-
186- @pytest .mark .production
187- def test_get_data_pandas ():
188- titanic = openml .datasets .get_dataset (40945 , download_data = False )
189- data , _ , _ , _ = titanic .get_data ()
190- assert isinstance (data , pd .DataFrame )
191- assert data .shape [1 ] == len (titanic .features )
192- assert data .shape [0 ] == 1309
193- col_dtype = {
194- "pclass" : "uint8" ,
195- "survived" : "category" ,
196- "name" : "object" ,
197- "sex" : "category" ,
198- "age" : "float64" ,
199- "sibsp" : "uint8" ,
200- "parch" : "uint8" ,
201- "ticket" : "object" ,
202- "fare" : "float64" ,
203- "cabin" : "object" ,
204- "embarked" : "category" ,
205- "boat" : "object" ,
206- "body" : "float64" ,
207- "home.dest" : "object" ,
208- }
209- for col_name in data .columns :
210- assert data [col_name ].dtype .name == col_dtype [col_name ]
211-
212- X , y , _ , _ = titanic .get_data (
213- target = titanic .default_target_attribute ,
214- )
215- assert isinstance (X , pd .DataFrame )
216- assert isinstance (y , pd .Series )
217- assert X .shape == (1309 , 13 )
218- assert y .shape == (1309 ,)
219- for col_name in X .columns :
220- assert X [col_name ].dtype .name == col_dtype [col_name ]
221- assert y .dtype .name == col_dtype ["survived" ]
222-
223- @pytest .mark .production
224- def test_get_data_rowid_and_ignore_and_target ():
225- dataset = openml .datasets .get_dataset (2 , download_data = False )
226- dataset .ignore_attribute = ["condition" ]
227- dataset .row_id_attribute = ["hardness" ]
228- X , y , categorical , names = dataset .get_data (target = "class" )
229- assert X .shape == (898 , 36 )
230- assert len (categorical ) == 36
231- cats = [True ] * 3 + [False , True , True , False ] + [True ] * 23 + [False ] * 3 + [True ] * 3
232- assert categorical == cats
233- assert y .shape == (898 ,)
234-
235+
236+
235237@pytest .mark .production
236- def test_get_data_with_nonexisting_class ():
238+ def test_get_data_with_nonexisting_class (mock_dataset_id_2 ):
237239 # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
238240 # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
239241 # indices 4 and 5, and that nothing is mapped to index 3.
240242 dataset = openml .datasets .get_dataset (2 , download_data = False )
241243 _ , y , _ , _ = dataset .get_data ("class" )
242244 assert list (y .dtype .categories ) == ["1" , "2" , "3" , "4" , "5" , "U" ]
243245
244-
246+
245247@pytest .mark .production
246- def test_get_data_corrupt_pickle ():
248+ def test_get_data_corrupt_pickle (mock_iris_dataset ):
247249 # Lazy loaded dataset, populate cache.
248250 iris = openml .datasets .get_dataset (61 , download_data = False )
249251 iris .get_data ()
@@ -255,7 +257,17 @@ def test_get_data_corrupt_pickle():
255257 xy , _ , _ , _ = iris .get_data ()
256258 assert isinstance (xy , pd .DataFrame )
257259 assert xy .shape == (150 , 5 )
258-
260+ iris .get_data ()
261+ # Corrupt pickle file, overwrite as empty.
262+ with open (iris .data_pickle_file , "w" ) as fh :
263+ fh .write ("" )
264+ # Despite the corrupt file, the data should be loaded from the ARFF file.
265+ # A warning message is written to the python logger.
266+ xy , _ , _ , _ = iris .get_data ()
267+ assert isinstance (xy , pd .DataFrame )
268+ assert xy .shape == (150 , 5 )
269+
270+ # check again!
259271@pytest .mark .production
260272def test_lazy_loading_metadata ():
261273 # Initial Setup
@@ -307,8 +319,11 @@ def test_equality_comparison(mock_iris_dataset, mock_titanic_dataset):
307319 assert iris == iris
308320 assert iris != titanic
309321 assert titanic != "Wrong_object"
310-
311-
322+
323+
324+
325+
326+
312327def test_tagging ():
313328 dataset = openml .datasets .get_dataset (125 , download_data = False )
314329
@@ -358,7 +373,6 @@ def test_add_illegal_long_ontology():
358373 assert e .code == 1105
359374
360375
361-
362376def test_add_illegal_url_ontology ():
363377 did = 1
364378 ontology = "not_a_url" + str (time ())
@@ -367,8 +381,8 @@ def test_add_illegal_url_ontology():
367381 assert False
368382 except openml .exceptions .OpenMLServerException as e :
369383 assert e .code == 1106
370-
371-
384+
385+
372386@pytest .mark .production ()
373387class OpenMLDatasetTestSparse (TestBase ):
374388 _multiprocess_can_split_ = True
0 commit comments