diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c0a77b4..52c4c3bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Add a "Feedback / Feature Requests" section to the README ([\#22](https://github.com/ubccr/xdmod-notebooks/pull/22)). - Improve performance of validation of filters and raw fields ([\#18](https://github.com/ubccr/xdmod-data/pull/18)). - Fix bug with trailing slashes in `xdmod_host` ([\#24](https://github.com/ubccr/xdmod-data/pull/24)). +- Use streaming for raw data requests ([\#19](https://github.com/ubccr/xdmod-data/pull/19)). ## v1.0.0 (2023-07-21) - Initial release. diff --git a/xdmod_data/_http_requester.py b/xdmod_data/_http_requester.py index cc089003..ad5bc939 100644 --- a/xdmod_data/_http_requester.py +++ b/xdmod_data/_http_requester.py @@ -44,24 +44,55 @@ def _request_data(self, params): def _request_raw_data(self, params): url_params = self.__get_raw_data_url_params(params) + # Once XDMoD 10.5 is no longer supported, there will be no need to call + # __get_raw_data_limit(), and the if/else statement below will not be + # necessary — only the body of the 'if' branch will be needed. limit = self.__get_raw_data_limit() data = [] - num_rows = limit - offset = 0 - while num_rows == limit: - response = self._request_json( - path='/rest/v1/warehouse/raw-data?' + url_params - + '&offset=' + str(offset) + if limit == 'NA': + response_iter_lines = self.__request( + path='/rest/v1/warehouse/raw-data?' + url_params, + post_fields=None, + stream=True, ) - partial_data = response['data'] - data += partial_data + i = 0 + for line in response_iter_lines: + line_text = line.decode('utf-8').replace('\x1e', '') + line_json = json.loads(line_text) + if i == 0: + response = {'fields': line_json} + else: + data.append(line_json) + if params['show_progress']: + progress_msg = ( + 'Got ' + str(i) + ' row' + ('' if i == 1 else 's') + + '...' + ) + print(progress_msg, end='\r') + i += 1 if params['show_progress']: - progress_msg = 'Got ' + str(len(data)) + ' rows...' - print(progress_msg, end='\r') - num_rows = len(partial_data) - offset += limit - if params['show_progress']: - print(progress_msg + 'DONE') + print(progress_msg + 'DONE') + else: + num_rows = limit + offset = 0 + while num_rows == limit: + response = self._request_json( + path='/rest/v1/warehouse/raw-data?' + url_params + + '&offset=' + str(offset) + ) + partial_data = response['data'] + data += partial_data + if params['show_progress']: + progress_msg = ( + 'Got ' + str(len(data)) + ' row' + + ('' if len(data) == 1 else 's') + + '...' + ) + print(progress_msg, end='\r') + num_rows = len(partial_data) + offset += limit + if params['show_progress']: + print(progress_msg + 'DONE') return (data, response['fields']) def _request_filter_values(self, realm_id, dimension_id): @@ -98,7 +129,7 @@ def __assert_connection_to_xdmod_host(self): + '\': ' + str(e) ) from None - def __request(self, path='', post_fields=None): + def __request(self, path='', post_fields=None, stream=False): _validator._assert_runtime_context(self.__in_runtime_context) url = self.__xdmod_host + path if post_fields: @@ -126,7 +157,10 @@ def __request(self, path='', post_fields=None): raise RuntimeError( 'Error ' + str(response.status_code) + msg ) from None - return response.text + if stream: + return response.iter_lines() + else: + return response.text def __get_data_post_fields(self, params): post_fields = { @@ -161,8 +195,18 @@ def __get_raw_data_url_params(self, params): ) return urlencode(results) + # Once XDMoD 10.5 is no longer supported, there will be no need for this + # method. def __get_raw_data_limit(self): if self.__raw_data_limit is None: - response = self._request_json('/rest/v1/warehouse/raw-data/limit') - self.__raw_data_limit = int(response['data']) + try: + response = self._request_json( + '/rest/v1/warehouse/raw-data/limit' + ) + self.__raw_data_limit = int(response['data']) + except RuntimeError as e: + if '404' in str(e): + self.__raw_data_limit = 'NA' + else: + raise return self.__raw_data_limit