Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/lib/data/dataset/migrating.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def _dataset_migrate_worker(
migrate_entry: DatasetMigrateEntry = worker_input.entry
destination_backend = migrate_entry.destination_backend
destination_region = migrate_entry.destination_region
destination_data_cred = destination_backend.resolved_data_credential

def _callback(
copy_input: copying.CopyWorkerInput,
Expand All @@ -160,7 +161,11 @@ def _callback(
relative_path=migrate_entry.relative_path,
storage_path=os.path.join(destination_backend.uri, migrate_entry.source_checksum),
url=os.path.join(
destination_backend.parse_uri_to_link(destination_region),
destination_backend.parse_uri_to_link(
destination_region,
override_url=destination_data_cred.override_url,
addressing_style=destination_data_cred.addressing_style,
),
migrate_entry.source_checksum,
),
size=migrate_entry.size,
Expand Down
9 changes: 8 additions & 1 deletion src/lib/data/dataset/uploading.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ def dataset_upload_remote_file_entry_generator(
data_cred = storage_client.data_credential
url_base = storage_backend.parse_uri_to_link(
region=storage_backend.region(data_cred),
override_url=data_cred.override_url,
addressing_style=data_cred.addressing_style,
)

# Iterate over the objects in the remote path.
Expand Down Expand Up @@ -282,6 +284,7 @@ def _write_to_manifest_cache(
etag = utils_common.etag_checksum(upload_entry.source)

destination: storage.StorageBackend = upload_entry.destination
destination_data_cred = destination.resolved_data_credential

def _callback(
upload_input: uploading.UploadWorkerInput,
Expand All @@ -293,7 +296,11 @@ def _callback(
relative_path=upload_entry.relative_path,
storage_path=os.path.join(destination.uri, etag),
url=os.path.join(
destination.parse_uri_to_link(upload_entry.destination_region),
destination.parse_uri_to_link(
upload_entry.destination_region,
override_url=destination_data_cred.override_url,
addressing_style=destination_data_cred.addressing_style,
),
etag,
),
size=upload_entry.size,
Expand Down
75 changes: 65 additions & 10 deletions src/lib/data/storage/backends/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,16 @@ def container_uri(self) -> str:
return f'{self.profile}/{self.container}'

@override
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
# pylint: disable=unused-argument
"""
Returns the https link corresponding to the uri
Returns the https link corresponding to the uri.
"""
return f'https://{self.netloc}/v1/{self.namespace}/{self.container}/{self.path}'.rstrip('/')

Expand Down Expand Up @@ -456,10 +462,41 @@ def container_uri(self) -> str:
return f'{self.profile}'

@override
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
"""
Returns the https link corresponding to the uri
Returns the https link corresponding to the uri.

When the credential has an override_url (CAIOS, R2, Wasabi, MinIO,
etc.) we build a URL against that host. The default is virtual-host
addressing because that's what modern S3-compatibles expect; pass
addressing_style='path' for legacy bucket names or path-style-only
deployments.
"""
if override_url:
parsed = parse.urlparse(override_url)
scheme = parsed.scheme or 'https'
if parsed.netloc:
host = parsed.netloc
base_path = parsed.path.rstrip('/')
else:
# No scheme prefix on override_url: urlparse drops the whole
# input into 'path'. Split off the host from any base path so
# reverse-proxied endpoints like 'gateway.example.com/s3' keep
# the '/s3' prefix in the resulting link.
bare = parsed.path.lstrip('/')
host, _, rest = bare.partition('/')
base_path = ('/' + rest).rstrip('/') if rest else ''
if addressing_style == 'path':
return (
f'{scheme}://{host}{base_path}/{self.container}/{self.path}'.rstrip('/')
)
return f'{scheme}://{self.container}.{host}{base_path}/{self.path}'.rstrip('/')
return f'https://{self.container}.s3.{region}.amazonaws.com/{self.path}'.rstrip('/')

@override
Expand Down Expand Up @@ -658,10 +695,16 @@ def container_uri(self) -> str:
return f'{self.profile}'

@override
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
# pylint: disable=unused-argument
"""
Returns the https link corresponding to the uri
Returns the https link corresponding to the uri.
"""
return (
f'https://storage.googleapis.com/storage/v1/b/{self.container}/o/{self.path}'
Expand Down Expand Up @@ -778,10 +821,16 @@ def container_uri(self) -> str:
return f'{self.profile}'

@override
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
# pylint: disable=unused-argument
"""
Returns the https link corresponding to the uri
Returns the https link corresponding to the uri.
"""
return f'https://{self.container}.{self.netloc}/{self.path}'.rstrip('/')

Expand Down Expand Up @@ -893,10 +942,16 @@ def container_uri(self) -> str:
return f'{self.profile}/{self.container}'

@override
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
# pylint: disable=unused-argument
"""
Returns the https link corresponding to the uri
Returns the https link corresponding to the uri.
"""
return f'{self.endpoint}/{self.container}/{self.path}'.rstrip('/')

Expand Down
17 changes: 16 additions & 1 deletion src/lib/data/storage/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,24 @@ def container_uri(self) -> str:
pass

@abc.abstractmethod
def parse_uri_to_link(self, region: str) -> str:
def parse_uri_to_link(
self,
region: str,
*,
override_url: str | None = None,
addressing_style: str | None = None,
) -> str:
"""
Returns the https link corresponding to the uri.

Args:
region: Region used by AWS-pattern URLs when no override is given.
override_url: Custom endpoint (S3-compatible providers). When set,
S3 backends build a URL against this host rather than AWS S3.
Other backends ignore it.
addressing_style: 'virtual' or 'path'. Used by S3 backends with a
custom override_url to select host-based vs path-based URLs.
Defaults to virtual (the form modern S3-compatibles expect).
"""
pass

Expand Down
56 changes: 56 additions & 0 deletions src/lib/data/storage/backends/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,62 @@ def test_env_override_invalid_raises(self):
self.assertIn("'virtua'", str(ctx.exception))


class S3BackendParseUriToLinkTest(unittest.TestCase):
"""Tests for S3Backend.parse_uri_to_link override-aware URL building."""

def _backend(self, uri: str = 's3://my-bucket/foo/bar') -> backends.S3Backend:
return cast(backends.S3Backend, backends.construct_storage_backend(uri))

def test_no_override_uses_aws_pattern(self):
"""Without an override_url, fall back to the AWS-host pattern (preserved)."""
link = self._backend().parse_uri_to_link('us-east-1')
self.assertEqual(link, 'https://my-bucket.s3.us-east-1.amazonaws.com/foo/bar')

def test_override_url_uses_virtual_host(self):
"""With an override_url, build a virtual-host URL against the custom endpoint
(CAIOS, R2, Wasabi, MinIO with wildcard DNS)."""
link = self._backend().parse_uri_to_link(
'US-EAST-14A',
override_url='https://cwobject.com',
)
self.assertEqual(link, 'https://my-bucket.cwobject.com/foo/bar')

def test_override_url_path_style(self):
"""addressing_style='path' yields a path-style URL — needed for
localstack/MinIO setups without wildcard DNS."""
link = self._backend().parse_uri_to_link(
'us-east-1',
override_url='http://localstack-s3.osmo:4566',
addressing_style='path',
)
self.assertEqual(link, 'http://localstack-s3.osmo:4566/my-bucket/foo/bar')

def test_override_url_preserves_scheme(self):
"""Scheme of the override is preserved in the link."""
link = self._backend().parse_uri_to_link(
'us-east-1',
override_url='http://minio.local:9000',
)
self.assertEqual(link, 'http://my-bucket.minio.local:9000/foo/bar')

def test_override_url_preserves_base_path_virtual_host(self):
"""Reverse-proxied endpoint (gateway.example.com/s3) keeps the /s3 prefix."""
link = self._backend().parse_uri_to_link(
'us-east-1',
override_url='https://gateway.example.com/s3',
)
self.assertEqual(link, 'https://my-bucket.gateway.example.com/s3/foo/bar')

def test_override_url_preserves_base_path_path_style(self):
"""Reverse-proxied endpoint with addressing_style=path preserves base path."""
link = self._backend().parse_uri_to_link(
'us-east-1',
override_url='https://gateway.example.com/s3',
addressing_style='path',
)
self.assertEqual(link, 'https://gateway.example.com/s3/my-bucket/foo/bar')


class S3BackendRegionTest(unittest.TestCase):
"""Tests for S3Backend.region() endpoint routing."""

Expand Down
Loading
Loading