Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
25ba161
Merge pull request #491 from MetaCell/main
ddelpiano Jul 7, 2025
aee7e71
Merge pull request #493 from MetaCell/main
ddelpiano Jul 22, 2025
1046552
SCKAN-443 feat: Add population_file param to ingest_statements; Updat…
afonsobspinto Sep 22, 2025
c0bad68
Update applications/composer/backend/composer/management/commands/ing…
afonsobspinto Sep 23, 2025
4883883
SCKAN-443 feat: Use set instead of list for population_uris
afonsobspinto Sep 23, 2025
5580625
Merge branch 'feature/SCKAN-443' of github.com:MetaCell/sckan-compose…
afonsobspinto Sep 23, 2025
da7ebc9
SCKAN-434 fix: Change add provenance route to not use path params
afonsobspinto Sep 24, 2025
7f8cd8c
SCKAN-434 feat: Add provenance validation
afonsobspinto Sep 24, 2025
b63b748
SCKAN-434 feat: Add fix provenances command
afonsobspinto Sep 24, 2025
c87eedc
SCKAN-434 feat: Alter provenance from URIField to custom field
afonsobspinto Sep 24, 2025
aceeba7
SCKAN-434 feat: Add isLoading external prop to FormBase
afonsobspinto Sep 24, 2025
8918e43
SCKAN-434 refactor: Apply copilot feedback
afonsobspinto Sep 24, 2025
662f0a0
SCKAN-434 style: Apply linting
afonsobspinto Sep 24, 2025
c970063
SCKAN-443 feat: Make population_file restrict what's ingested
afonsobspinto Sep 29, 2025
80a1f97
Merge pull request #506 from MetaCell/feature/SCKAN-443
ddelpiano Sep 29, 2025
654df36
Merge pull request #507 from MetaCell/feature/SCKAN-434
ddelpiano Sep 29, 2025
c1a3084
bumping version up
ddelpiano Sep 29, 2025
2beb36a
SCKAN-434 fix: Make doi regex more rigid
afonsobspinto Sep 30, 2025
42107fe
Merge pull request #508 from MetaCell/feature/SCKAN-434-b
ddelpiano Oct 1, 2025
c9955fa
SCKAN-443 feat: Prevent transitions when population uri is provided
afonsobspinto Oct 2, 2025
4429268
Merge pull request #509 from MetaCell/feature/SCKAN-443-b
ddelpiano Oct 2, 2025
083ddd6
updating zlib1g
ddelpiano Oct 3, 2025
bce4110
updating imagemagick
ddelpiano Oct 3, 2025
c63c33e
updating libgnutls
ddelpiano Oct 3, 2025
5e2f4b1
updating bluez
ddelpiano Oct 3, 2025
fe988cf
updating libxml2
ddelpiano Oct 3, 2025
5c3074d
updating starlette
ddelpiano Oct 3, 2025
ccf29ad
updating tiff
ddelpiano Oct 3, 2025
7671b68
updating pillow
ddelpiano Oct 3, 2025
e2db9d6
updating libxsl
ddelpiano Oct 3, 2025
bb05c85
updating django and some more system libs
ddelpiano Oct 3, 2025
33c3e33
updating perl
ddelpiano Oct 3, 2025
a731027
fixing libtiff with new package
ddelpiano Oct 6, 2025
af02cd9
SCKAN-443 feat: Rollback transition skip
afonsobspinto Oct 6, 2025
47b778f
SCKAN-443 feat: Improve error message
afonsobspinto Oct 6, 2025
0839c78
SCKAN-443 feat: Add transition to invalid on export transition failure
afonsobspinto Oct 6, 2025
cef27ce
moving up some packages
ddelpiano Oct 6, 2025
e8ef5cc
SCKAN-443 chore: Update ingestement tests
afonsobspinto Oct 6, 2025
3e44e08
SCKAN-443 chore: Remove trailing whitespace
afonsobspinto Oct 6, 2025
d059f8e
Merge pull request #511 from MetaCell/feature/SCKAN-443-c
ddelpiano Oct 6, 2025
261d3c6
another upgrade
ddelpiano Oct 6, 2025
76b0c17
Merge pull request #510 from MetaCell/feature/vulnerabilities
ddelpiano Oct 6, 2025
310af57
increasing field size
ddelpiano Oct 6, 2025
c2c5255
Merge pull request #512 from MetaCell/feature/field_length_fix
ddelpiano Oct 6, 2025
36e7e26
fix: Bump django-admin-sortable2 version
afonsobspinto Oct 6, 2025
83b476d
Merge pull request #514 from MetaCell/feature/fix-dependencies
afonsobspinto Oct 6, 2025
5fcc078
fix: Correct PopulationSet get_or_create
afonsobspinto Oct 9, 2025
c7fee58
fix: Correct form base spinner logic
afonsobspinto Oct 16, 2025
c7a377d
chore: Bump composer version
afonsobspinto Oct 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion applications/composer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,20 @@ ENV MODULE_NAME=backend \
APP_DIR=/usr/src/app

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends --allow-downgrades \
nginx supervisor git \
zlib1g \
zlib1g-dev \
imagemagick \
libgnutls30 \
bluez \
libxml2 \
libtiff6 \
libxslt1.1 \
libexpat1 \
perl \
&& apt-get install --only-upgrade zlib1g zlib1g-dev -y \
&& rm -rf /var/lib/apt/lists/*

WORKDIR ${APP_DIR}
Expand Down
2 changes: 2 additions & 0 deletions applications/composer/backend/backend/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@
"http://localhost:8000/",
"https://localhost:8000/",
"https://127.0.0.1:8000/",
"http://localhost:3000/",
"https://localhost:3000/",
]

# override django admin base template with a local template
Expand Down
5 changes: 5 additions & 0 deletions applications/composer/backend/composer/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,11 @@ class Meta:
fields = ("id", "uri", "connectivity_statement_id")


class ProvenanceCreateSerializer(serializers.Serializer):
    """Serializer for creating provenance via request body.

    Declares the single input field expected in the request payload:
    ``uri``, a mandatory character field.
    """

    # Mandatory; validation fails when the payload omits it.
    uri = serializers.CharField(required=True)


class SentenceConnectivityStatement(serializers.ModelSerializer):
"""Connectivity Statement"""

Expand Down
23 changes: 10 additions & 13 deletions applications/composer/backend/composer/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
TagSerializer,
ViaSerializer,
ProvenanceSerializer,
ProvenanceCreateSerializer,
SexSerializer,
PopulationSetSerializer,
ConnectivityStatementUpdateSerializer,
Expand Down Expand Up @@ -151,23 +152,19 @@ class ProvenanceMixin(
viewsets.GenericViewSet,
):
@extend_schema(
parameters=[
OpenApiParameter(
"uri",
OpenApiTypes.STR,
location=OpenApiParameter.PATH,
required=True,
)
],
request=None,
request=ProvenanceCreateSerializer,
responses={200: "ConnectivityStatement updated successfully"},
)
@action(detail=True, methods=["post"], url_path="add_provenance/(?P<uri>.*)")
def add_provenance(self, request, pk=None, uri=None):
procenance, created = Provenance.objects.get_or_create(
@action(detail=True, methods=["post"], url_path="add_provenance")
def add_provenance(self, request, pk=None):
serializer = ProvenanceCreateSerializer(data=request.data)
serializer.is_valid(raise_exception=True)

uri = serializer.validated_data['uri']
provenance, created = Provenance.objects.get_or_create(
connectivity_statement_id=pk,
uri=uri,
)
procenance.save()
instance = self.get_object()
return Response(self.get_serializer(instance).data)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Fix Provenance URIs Command

This Django management command fixes provenance URIs that were affected by the URL decoding bug, where URIs are missing a slash after the protocol (e.g., `https:/example.com` instead of `https://example.com`).

## Usage

### Basic usage (dry run - recommended first):
```bash
python manage.py fix_provenance_uris --dry-run
```

### Apply the fixes:
```bash
python manage.py fix_provenance_uris
```

### Custom options:
```bash
python manage.py fix_provenance_uris --dry-run --batch-size 500 --output-file my_fixes.txt
```

## Options

- `--dry-run`: Show what would be fixed without making changes (recommended for testing)
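- `--output-file`: Path of the report file (default: `fixed_provenance_ids_<timestamp>.txt`, or `fixed_provenance_ids_dry_run_<timestamp>.txt` in dry-run mode)
- `--batch-size`: Number of provenances fetched from the database per chunk; progress is also reported once per this many records (default: 1000)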

## Output

The command creates a detailed text file containing:
1. List of fixed provenance IDs (one per line) for easy scripting
2. Detailed change log showing original and fixed URIs
3. Associated connectivity statement IDs
4. Timestamp and summary information

## What it fixes

The command identifies and fixes URIs with these patterns:
- `http:/example.com` → `http://example.com`
- `https:/example.com` → `https://example.com`

It validates fixes to ensure they:
- Add exactly one slash after the protocol
- Don't break already valid URIs
- Follow expected URI patterns

## Safety features

- **Dry run mode**: Test before applying changes
- **Validation**: Ensures fixes are logical and safe
- **Atomic transactions**: All changes are applied inside a single database transaction, so an error part-way through rolls back every change
- **Progress reporting**: Shows progress for large datasets
- **Detailed logging**: Records all changes for audit purposes
- **Batch processing**: Efficient memory usage for large datasets

## Example output file

```
# Fixed Provenance URIs Report
# Generated: 2025-09-24 15:30:45
# Total fixed: 3
# Format: ID | Original URI | Fixed URI | Connectivity Statement ID

# Fixed Provenance IDs (one per line):
123
456
789

# Detailed Changes:
ID: 123
Connectivity Statement: 15
Original: https:/www.example.com
Fixed: https://www.example.com
Change: Added missing slash after protocol
------------------------------------------------------------
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
import os
import re
from datetime import datetime
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.conf import settings
from composer.models import Provenance


class Command(BaseCommand):
help = 'Fix provenance URIs that were affected by the URL decoding bug (missing slash after protocol)'

def add_arguments(self, parser):
    """Register the command-line options of this command on ``parser``.

    Options are added in this order:
      * ``--dry-run``: boolean flag (``store_true``).
      * ``--output-file``: string, defaults to ``None``.
      * ``--batch-size``: integer, defaults to ``1000``.
    """
    # (flag, keyword arguments) pairs, kept in registration order.
    option_specs = (
        ('--dry-run', {
            'action': 'store_true',
            'help': 'Show what would be fixed without making changes',
        }),
        ('--output-file', {
            'type': str,
            'default': None,
            'help': 'Output file for fixed provenance IDs (default: fixed_provenance_ids_YYYY-MM-DD_HH-MM-SS.txt)',
        }),
        ('--batch-size', {
            'type': int,
            'default': 1000,
            'help': 'Batch size for processing provenances (default: 1000)',
        }),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

def handle(self, *args, **options):
    """Scan all provenances and repair URIs missing a slash after the scheme.

    Reads ``dry_run``, ``batch_size`` and ``output_file`` from ``options``.
    Every ``Provenance`` row is checked against the ``http:/x`` and
    ``https:/x`` patterns; a match is rewritten with the missing slash,
    validated with ``is_valid_fix`` and, unless ``dry_run`` is set, saved.
    A report of all (would-be) fixes is written through
    ``write_detailed_results_file`` and a summary is printed.

    Raises:
        CommandError: if ``batch_size`` is not positive, or if any error
            occurs while processing (the original exception is chained).
    """
    dry_run = options['dry_run']
    batch_size = options['batch_size']

    # A zero batch size would divide by zero in the progress check below,
    # and the chunked iterator needs a positive chunk size as well.
    if batch_size < 1:
        raise CommandError('--batch-size must be a positive integer')

    # Generate output filename with timestamp if not provided
    if options['output_file']:
        output_file = options['output_file']
    else:
        timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        mode = "dry_run_" if dry_run else ""
        output_file = f'fixed_provenance_ids_{mode}{timestamp}.txt'

    self.stdout.write(self.style.SUCCESS('Starting provenance URI fix...'))

    if dry_run:
        self.stdout.write(self.style.WARNING('DRY RUN MODE - No changes will be made'))

    # Patterns to identify and fix broken URIs, compiled once up front
    # instead of being re-resolved for every row.
    fix_patterns = (
        (re.compile(r'^http:/([^/].+)$'), r'http://\1'),    # http:/domain -> http://domain
        (re.compile(r'^https:/([^/].+)$'), r'https://\1'),  # https:/domain -> https://domain
    )

    fixed_provenances = []
    total_checked = 0

    try:
        # Get total count for progress
        total_provenances = Provenance.objects.count()
        self.stdout.write(f'Found {total_provenances} provenances to check...')

        with transaction.atomic():
            # Only the connectivity_statement_id column is read below, so
            # no join on the related statement is needed. iterator()
            # streams rows in chunks instead of loading them all at once.
            provenance_qs = Provenance.objects.all()

            for provenance in provenance_qs.iterator(chunk_size=batch_size):
                total_checked += 1
                original_uri = provenance.uri
                fixed_uri = None

                # First matching pattern wins; subn reports whether a
                # substitution happened, so one scan per pattern suffices.
                for pattern, replacement in fix_patterns:
                    candidate, substitutions = pattern.subn(replacement, original_uri)
                    if substitutions:
                        fixed_uri = candidate
                        break

                # If we found a fix, validate and apply it
                if fixed_uri and fixed_uri != original_uri:
                    if self.is_valid_fix(original_uri, fixed_uri):
                        self.stdout.write(
                            f'ID {provenance.id}: {original_uri} -> {fixed_uri}'
                        )

                        if not dry_run:
                            provenance.uri = fixed_uri
                            provenance.save(update_fields=['uri'])

                        fixed_provenances.append({
                            'id': provenance.id,
                            'original_uri': original_uri,
                            'fixed_uri': fixed_uri,
                            'connectivity_statement_id': provenance.connectivity_statement_id,
                        })
                    else:
                        self.stdout.write(
                            self.style.WARNING(
                                f'Skipping ID {provenance.id}: Invalid fix pattern {original_uri} -> {fixed_uri}'
                            )
                        )

                # Progress indicator
                if total_checked % batch_size == 0:
                    self.stdout.write(f'Processed {total_checked}/{total_provenances}...')

        # Write detailed results to file
        self.write_detailed_results_file(fixed_provenances, output_file, dry_run)

        # Summary
        self.stdout.write(
            self.style.SUCCESS(
                f'\nProcess completed!\n'
                f'Total provenances checked: {total_checked}\n'
                f'Provenances fixed: {len(fixed_provenances)}\n'
                f'Results written to: {output_file}'
            )
        )

        if dry_run and fixed_provenances:
            self.stdout.write(
                self.style.WARNING(
                    'Run without --dry-run to apply these fixes.'
                )
            )
        elif not fixed_provenances:
            self.stdout.write(
                self.style.SUCCESS(
                    'No broken URIs found! All provenances appear to be correct.'
                )
            )

    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise CommandError(f'Error processing provenances: {e}') from e

def is_valid_fix(self, original, fixed):
    """Return True when ``fixed`` is ``original`` with one slash added after the scheme.

    A fix is accepted only if:
      * ``original`` starts with ``http:/`` or ``https:/`` but not with
        the already-valid ``http://`` / ``https://``; and
      * ``fixed`` equals ``original`` with exactly one ``/`` inserted
        after ``<scheme>:/``, leaving the rest of the URI untouched.

    Args:
        original: The URI string as currently stored.
        fixed: The proposed replacement URI string.

    Returns:
        bool: True if the replacement is the expected one-slash repair,
        False otherwise (including for any other scheme).
    """
    for scheme in ('http', 'https'):
        broken_prefix = f'{scheme}:/'
        valid_prefix = f'{scheme}://'

        # Skip schemes that do not apply, and URIs that are already valid.
        if not original.startswith(broken_prefix) or original.startswith(valid_prefix):
            continue

        # Compare the whole string, not only prefix and length, so a
        # replacement that alters the rest of the URI is rejected.
        return fixed == valid_prefix + original[len(broken_prefix):]

    return False

def write_detailed_results_file(self, fixed_provenances, filename, dry_run):
    """Write the report of fixed provenances to ``filename`` as UTF-8 text.

    The report has a commented header (title, generation time, total),
    then either a "no broken URIs" line or the list of fixed IDs followed
    by one detailed entry per fix. Any failure while writing is reported
    on ``self.stdout`` as an error instead of being raised.

    Args:
        fixed_provenances: Sequence of dicts with the keys ``id``,
            ``original_uri``, ``fixed_uri`` and ``connectivity_statement_id``.
        filename: Path of the file to create or overwrite.
        dry_run: When true, the report title is prefixed with "DRY RUN - ".
    """
    title_prefix = "DRY RUN - " if dry_run else ""
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def report_lines():
        # Header
        yield f"# {title_prefix}Fixed Provenance URIs Report\n"
        yield f"# Generated: {generated_at}\n"
        yield f"# Total fixed: {len(fixed_provenances)}\n"
        yield "# Format: ID | Original URI | Fixed URI | Connectivity Statement ID\n"
        yield "#" + "=" * 80 + "\n\n"

        if not fixed_provenances:
            yield "No broken URIs found.\n"
            return

        # Bare IDs first, one per line, for easy scripting
        yield "# Fixed Provenance IDs (one per line):\n"
        for entry in fixed_provenances:
            yield f"{entry['id']}\n"

        yield "\n# Detailed Changes:\n"

        # One detailed record per fix
        for entry in fixed_provenances:
            yield f"ID: {entry['id']}\n"
            yield f" Connectivity Statement: {entry['connectivity_statement_id']}\n"
            yield f" Original: {entry['original_uri']}\n"
            yield f" Fixed: {entry['fixed_uri']}\n"
            yield " Change: Added missing slash after protocol\n"
            yield "-" * 60 + "\n"

    try:
        with open(filename, 'w', encoding='utf-8') as report:
            # The generator is consumed lazily, so lines reach the file in
            # the same order as they are produced.
            report.writelines(report_lines())

        self.stdout.write(f'Detailed results written to {filename}')

    except Exception as e:
        self.stdout.write(
            self.style.ERROR(f'Failed to write results file: {e}')
        )
Loading
Loading