Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 70 additions & 6 deletions pykew/core.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from urllib.error import HTTPError
import requests
import time
import urllib
import subprocess


# Base URLs of the remote APIs this module talks to. An ``Api`` object is
# constructed from one of these, and ``Api.get`` compares its base URL
# against POWO_URL to decide which transport to use.
IPNI_URL = 'http://beta.ipni.org/api/1'
POWO_URL = 'https://powo.science.kew.org/api/2'
KPL_URL = 'http://kewplantlist.org/api/v1'

class SearchError(Exception):
    """Raised when a search could not retrieve results within the allowed number of attempts."""

class Api:
def __init__(self, url):
self._base_url = url
Expand All @@ -17,19 +23,53 @@ def _url(self, method, params):
opt=urllib.parse.urlencode(params))

def get(self, method, params=None):
    """Issue a GET request for *method* with *params* and return the response.

    Requests against ``POWO_URL`` are sent through ``curl`` (see
    ``_get_using_curl``) with the ``cursor`` parameter removed; every other
    base URL goes through ``requests.get``.

    When the server answers "too many requests", the call sleeps for five
    seconds and tries again, for as long as the server keeps throttling.

    :param method: API method name appended to the base URL.
    :param params: mapping of query-string parameters (defaults to none).
    :returns: a ``requests.Response``.
    """
    if params is None:
        params = {}

    # POWO doesn't like calls from Python, so use curl
    if self._base_url == POWO_URL:
        # NOTE(review): the cursor is dropped for POWO; the reason is not
        # visible here -- confirm before re-enabling paging for POWO.
        curl_params = {k: v for k, v in params.items() if k != "cursor"}
        url = self._url(method, curl_params)
        fetch = self._get_using_curl
    else:
        url = self._url(method, params)
        fetch = requests.get

    # Loop instead of recursing so that a long throttling period cannot
    # exhaust the interpreter's recursion limit.
    while True:
        resp = fetch(url)
        # 429 is HTTP "Too Many Requests"; 249 is the value this check
        # historically used and is kept only for backward compatibility.
        if resp.status_code not in (429, 249):
            return resp
        time.sleep(5)  # too many requests, retry after 5 sec

def _get_using_curl(self, url):
    """Fetch *url* with the ``curl`` executable and wrap the output in a ``requests.Response``.

    ``curl -i`` prints the status line and headers, a blank line, and then
    the body; that text is parsed to fill in ``status_code``, ``headers``,
    the body content, ``url`` and ``encoding`` of a fresh response object.

    :param url: fully built request URL.
    :returns: a ``requests.Response`` populated from curl's output.
    :raises requests.ConnectionError: if curl produced no output at all.
    :raises ValueError: if the status line cannot be parsed.
    """
    completed = subprocess.run(
        # -sS: no progress meter on stderr, but still report errors there
        ["curl", "-sS", "-i", url],
        capture_output=True,
        text=True,
    )
    raw_response = completed.stdout
    if not raw_response.strip():
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise requests.ConnectionError(
            f"curl returned no output for {url} "
            f"(exit code {completed.returncode}): {completed.stderr.strip()}"
        )

    # Text mode normalises line endings to "\n", so the first blank line
    # separates the header block from the body.
    header_part, _, body = raw_response.partition("\n\n")

    header_lines = header_part.splitlines()
    status_line = header_lines[0]
    # Status line looks like "<protocol> <code> [reason]"
    _, status_code, *_ = status_line.split(" ")
    status_code = int(status_code)

    # Case-insensitive mapping, so header lookups do not depend on the
    # capitalisation curl happened to print.
    headers = requests.structures.CaseInsensitiveDict()
    for line in header_lines[1:]:
        if ":" in line:
            k, v = line.split(":", 1)
            headers[k.strip()] = v.strip()

    response = requests.Response()
    response.status_code = status_code
    response.headers = headers
    response._content = body.encode("utf-8")
    response.url = url
    response.encoding = "utf-8"

    return response

class SearchResult:
def __init__(self, api, query, filters = None):
    """Store the search parameters and fetch the first page of results.

    :param api: object whose ``get`` method performs the request.
    :param query: the search query.
    :param filters: optional filters applied to the search.
    :raises SearchError: if no results could be retrieved within the
        retry limit (raised by ``_run_query_with_retries``).
    """
    self._query = query
    self._filters = filters
    self._api = api
    self._cursor = "*"
    # Must be set before the first query, because the retry loop reads it.
    self._max_retry_attempts = 3
    self._run_query_with_retries()

def _build_params(self):
params = {'perPage': 500, 'cursor': self._cursor}
Expand All @@ -53,14 +93,34 @@ def _format_filters(self):
else:
return self._filters.value

def _run_query_with_retries(self):
    """Run the query, retrying up to ``self._max_retry_attempts`` times.

    An attempt counts as failed when ``_run_query`` raises ``HTTPError`` or
    returns without setting ``self._results``.

    :raises SearchError: if no attempt produced results. When the last
        attempt failed with an ``HTTPError``, that error is attached as
        the cause.
    """
    # Forget the previous page so that a failed fetch cannot be mistaken
    # for a successful one.
    if hasattr(self, "_results"):
        del self._results

    last_error = None
    for _ in range(self._max_retry_attempts):
        try:
            self._run_query()
        except HTTPError as e:
            last_error = e
            continue

        if hasattr(self, "_results"):
            return
        last_error = None

    # _response is unset if every attempt raised before a response was
    # parsed; fall back to None rather than failing with AttributeError.
    last_response = getattr(self, "_response", None)
    raise SearchError(f"Couldn't retrieve results within {self._max_retry_attempts} attempts. The last response was: {last_response}") from last_error

def _run_query(self):
    """Fetch one page of search results and update the iteration state.

    Sets ``self._wait_time`` and ``self._response`` on every call. Sets
    ``self._results`` to an empty iterator when the response reports zero
    total results, or to an iterator over the ``results`` entry when one
    is present; otherwise ``self._results`` is left untouched. Updates
    ``self._cursor`` when the response carries a ``cursor`` entry.
    """
    params = self._build_params()
    response = self._api.get('search', params)
    # wait a proportion of server response time of previous call
    # before making subsequent calls
    self._wait_time = response.elapsed.total_seconds() / 2.0
    self._response = response.json()
    if self._response.get('totalResults') == 0:
        # An explicit "nothing found" is a valid, empty result set.
        self._results = iter([])
    elif 'results' in self._response:
        self._results = iter(self._response['results'])
    if 'cursor' in self._response:
        self._cursor = self._response['cursor']
Expand All @@ -71,9 +131,13 @@ def __iter__(self):
def __next__(self):
    """Return the next result, fetching a further page when the current one is exhausted.

    For the POWO API no further page is requested: iteration stops as soon
    as the current page runs out. For other APIs the method sleeps for
    ``self._wait_time`` seconds, re-runs the query with retries, and
    returns the first item of the new page.

    :raises StopIteration: when there are no more results.
    :raises SearchError: if the follow-up query fails within the retry limit.
    """
    try:
        return next(self._results)
    except StopIteration:
        # avoid repeated calls for POWO (they might still make sense for
        # other apps - to be verified later)
        if self._api._base_url == POWO_URL:
            raise

    time.sleep(self._wait_time)
    self._run_query_with_retries()
    return next(self._results)

def size(self):
Expand Down