Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions oz_tree_build/images_and_vernaculars/get_wiki_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import re
import sys
import time
import urllib.request
from pathlib import Path

import requests
Expand Down Expand Up @@ -71,6 +70,9 @@
"img",
)

# See https://meta.wikimedia.org/wiki/User-Agent_policy
wiki_http_headers = {"User-Agent": "OneZoomBot/0.1 (https://www.onezoom.org/; mail@onezoom.org) get-wiki-images/0.1"}


# Copied from OZTree/OZprivate/ServerScripts/Utilities/getEOL_crops.py
def subdir_name(doID):
Expand All @@ -90,11 +92,6 @@ def make_http_request_with_retries(url):
retrying if we get a 429 rate limit error.
"""

# See https://meta.wikimedia.org/wiki/User-Agent_policy
wiki_http_headers = {
"User-Agent": "OneZoomBot/0.1 (https://www.onezoom.org/; " "mail@onezoom.org) get-wiki-images/0.1"
}

retries = 6
delay = 1
for i in range(retries):
Expand Down Expand Up @@ -374,7 +371,11 @@ def save_wiki_image(db, leaf_data, image_name, src, src_id, rating, output_dir,

# Download the uncropped image
uncropped_image_path = f"{image_dir}/{src_id}_uncropped.jpg"
urllib.request.urlretrieve(image_url, uncropped_image_path)
response = requests.get(image_url, headers=wiki_http_headers)
response.raise_for_status()
with open(uncropped_image_path, "wb") as f:
for chunk in response.iter_content(1024):
f.write(chunk)

if cropper is None:
# Default to centering the crop
Expand Down
19 changes: 11 additions & 8 deletions tests/test_get_wiki_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ def __init__(self, status_code, json_data=None, content=None):
def json(self):
    """Mimic ``requests.Response.json()`` by returning the canned payload."""
    payload = self.json_data
    return payload

def raise_for_status(self):
    """Mimic ``requests.Response.raise_for_status()``.

    The real method raises only for 4xx/5xx responses, so a mocked 201,
    204 or 3xx must NOT raise here.  We use ValueError instead of
    requests.HTTPError to avoid depending on requests internals in tests.
    """
    if self.status_code >= 400:
        raise ValueError("status = %d" % self.status_code)

def iter_content(self, chunk_size):
    """Mimic ``requests.Response.iter_content()``.

    Unlike the previous stub, this honours *chunk_size* by slicing the
    canned payload, and yields nothing when no content was provided
    (``self.content`` may be None for responses without a body), so
    callers can safely iterate and write every chunk.
    """
    content = self.content or b""
    return [content[i : i + chunk_size] for i in range(0, len(content), chunk_size)]


class RemoteAPIs:
"""
Expand Down Expand Up @@ -124,6 +131,10 @@ def __init__(self, mock_qid):
# This should not be called: if license is bad => don't download
**self.wikimedia_file_response("BadLicence.jpg", "xxx")
)
self.add_mocked_request(
url="https://upload.wikimedia.org/wikipedia/commons/not/a/real/image.jpg",
response=None, # NB: This maps to MockResponse.json_data, we have none, but content is replaced later
)

# Mock the requests.get function
def mocked_requests_get(self, *args, **kwargs):
Expand All @@ -132,13 +143,6 @@ def mocked_requests_get(self, *args, **kwargs):
return MockResponse(200, self.mocked_requests[args[0]], content)
return MockResponse(404)

def mocked_urlretrieve(self, *args, **kwargs):
    """Stand-in for ``urllib.request.urlretrieve``.

    Rather than performing a real download, validate that the URL looks
    like an HTTP(S) one and copy a tiny in-memory test image to the
    destination path.
    """
    url, destination = args[0], args[1]
    if not url.startswith("http"):
        raise ValueError("Only HTTP URLs are supported in these tests")
    with open(destination, "wb") as handle:
        handle.write(TINY_JPEG)

# Mock the Azure Vision API smart crop response
def mocked_analyze_from_url(self, *args, **kwargs):
return SimpleNamespace(
Expand Down Expand Up @@ -219,7 +223,6 @@ def wikidata_response(self, image_data, vernacular_data):

def mock_patch_all_web_request_methods(self, f):
@mock.patch("requests.get", side_effect=self.mocked_requests_get)
@mock.patch("urllib.request.urlretrieve", side_effect=self.mocked_urlretrieve)
@mock.patch(
"azure.ai.vision.imageanalysis.ImageAnalysisClient.analyze_from_url",
side_effect=self.mocked_analyze_from_url,
Expand Down