diff --git a/applications/composer/backend/composer/admin.py b/applications/composer/backend/composer/admin.py index b017c530..17069fc8 100644 --- a/applications/composer/backend/composer/admin.py +++ b/applications/composer/backend/composer/admin.py @@ -374,25 +374,24 @@ class ConnectivityStatementAdmin( readonly_fields = ( "state", "curie_id", - "has_statement_been_exported", "reference_uri", + "population_index" ) - exclude = ("journey_path", "statement_prefix", "statement_suffix", "population_index") + exclude = ("journey_path", "statement_prefix", "statement_suffix", ) autocomplete_fields = ("sentence", "origins") date_hierarchy = "modified_date" list_display = ( "sentence", - "pmid", - "pmcid", "short_ks", + "population_set_name", + "population_index", "tag_list", "state", - "has_notes", "owner", ) - list_display_links = ("sentence", "pmid", "pmcid", "short_ks", "state") - list_filter = ("state", "owner", "tags__tag") - list_select_related = ("sentence", "origins", "destinations") + list_display_links = ("sentence", "short_ks", "state") + list_filter = ("state", "population", "owner", "tags__tag") + list_select_related = ("sentence", "population", "owner", "origins", "destinations") search_fields = ( "sentence__title", "sentence__text", @@ -433,17 +432,9 @@ def delete_queryset(self, request, queryset): def short_ks(self, obj): return str(obj) - @admin.display(description="PMID") - def pmid(self, obj): - return obj.sentence.pmid - - @admin.display(description="PMCID") - def pmcid(self, obj): - return obj.sentence.pmcid - - @admin.display(description="REFERENCE") - def reference(self, obj): - return str(obj) + @admin.display(description="Population Set") + def population_set_name(self, obj): + return obj.population.name if obj.population else "-" class ExportBatchAdmin(admin.ModelAdmin): diff --git a/applications/composer/backend/composer/management/commands/README_reassign_population_indices.md 
b/applications/composer/backend/composer/management/commands/README_reassign_population_indices.md new file mode 100644 index 00000000..c56d6039 --- /dev/null +++ b/applications/composer/backend/composer/management/commands/README_reassign_population_indices.md @@ -0,0 +1,181 @@ +# Reassign Population Indices Command + +## Overview + +This Django management command reassigns `population_index` values to connectivity statements based on patterns found in their `curie_id` field. It processes all population sets and their associated exported statements. + +## Purpose + +The command is designed to: +1. Extract hypothetical population indices from `curie_id` fields +2. Handle conflicts where multiple statements have the same hypothetical index +3. Assign sequential indices to statements that couldn't get a hypothetical index +4. Update the `last_used_index` on population sets + +## Expected curie_id Pattern + +The command expects `curie_id` values to follow this pattern: +``` +neuron type {population_name} {population_index} +``` + +For example: +- `neuron type rat 1` +- `neuron type mouse 42` + +## How It Works + +### Phase 1: Analysis +- Retrieves all statements with `has_statement_been_exported=True` for each population +- Extracts the hypothetical population index from each statement's `curie_id` +- Tracks statements that don't match the pattern or have no `curie_id` + +### Phase 2: Index Assignment +- For unique hypothetical indices: Assigns directly +- For conflicts (multiple statements with same index): + - Statement with smallest `id` (created earliest) gets the index + - Other statements go into a "bag" for later assignment +- Logs all assignments and conflicts + +### Phase 3: Bag Processing +- Statements in the bag (conflicts or no pattern match) get assigned sequential indices +- Starting from the last used index + 1 +- Maintains creation order (sorted by statement `id`) + +### Phase 4: Database Update +- Updates `population_index` on each statement +- 
Updates `last_used_index` on the population set +- All changes wrapped in a transaction + +## Usage + +### Basic Usage +```bash +python manage.py reassign_population_indices +``` + +### Dry Run (Preview Changes) +```bash +python manage.py reassign_population_indices --dry-run +``` + +### Process Specific Population +```bash +python manage.py reassign_population_indices --population "rat" +``` + +### Custom Log File +```bash +python manage.py reassign_population_indices --output-file /path/to/logfile.log +``` + +### Combined Options +```bash +python manage.py reassign_population_indices --dry-run --population "mouse" --output-file mouse_dry_run.log +``` + +## Command Options + +| Option | Description | Default | +|--------|-------------|---------| +| `--dry-run` | Preview changes without applying them | False | +| `--output-file` | Path to log file | `population_index_reassignment_YYYY-MM-DD_HH-MM-SS.log` | +| `--population` | Process only a specific population set by name | All populations | + +## Log Output + +The command generates a detailed log file containing: + +1. **Per-Statement Details**: + - Statement ID + - Hypothetical index extracted from `curie_id` + - Assigned population index + - Whether it was a conflict winner/loser or bag assignment + +2. **Special Cases**: + - Statements with missing `curie_id` + - Statements where pattern couldn't be matched + - The actual `curie_id` value for debugging + +3. **Summary**: + - Total population sets processed + - Total statements processed + - Total statements reassigned + - Total conflicts resolved + +### Example Log Output + +``` +Population Index Reassignment Report +Generated: 2025-11-05 10:30:45 +Mode: LIVE +================================================================================ + +Processing 1 population set(s)... 
+ +Processing Population Set: rat +-------------------------------------------------------------------------------- +Found 5 exported statement(s) + +Phase 1: Analyzing curie_id patterns... + Statement 101: Found hypothesis index 1 from curie_id + Statement 102: Found hypothesis index 2 from curie_id + Statement 103: Found hypothesis index 2 from curie_id + Statement 104: WARNING - Could not extract index from curie_id: 'invalid format' + Statement 105: WARNING - No curie_id present + +Phase 2: Assigning population indices... + Statement 101: Assigned index 1 + Statement 102: Assigned index 2 (conflict winner) + Statement 103: Moved to bag (conflict loser, had same hypothesis index 2) + +Phase 3: Assigning indices to 3 statement(s) in bag... + Statement 103: Assigned index 3 (from bag) + Statement 104: Assigned index 4 (from bag) + Statement 105: Assigned index 5 (from bag) + +Phase 4: Updating database... + Statement 101: No change needed (already 1) + Statement 102: No change needed (already 2) + Statement 103: Updated from 2 to 3 + Statement 104: Updated from None to 4 + Statement 105: Updated from None to 5 + Population rat: Updated last_used_index from 2 to 5 + +Special Cases (curie_id issues): + Statement 104: Pattern not matched + curie_id: 'invalid format' + Statement 105: Missing curie_id + +Completed population 'rat': + - Statements processed: 5 + - Statements reassigned: 3 + - Conflicts resolved: 1 + - Special cases: 2 + +================================================================================ +SUMMARY +================================================================================ +Total population sets processed: 1 +Total statements processed: 5 +Total statements reassigned: 3 +Total conflicts resolved: 1 +``` + +## Important Notes + +1. **Conflict Resolution**: When multiple statements have the same hypothetical index, the statement with the smallest `id` (earliest creation) takes precedence. + +2. **Transaction Safety**: All database updates are wrapped in a transaction, so either all changes succeed or none are applied. 
+ +3. **Dry Run First**: Always run with `--dry-run` first to preview changes before applying them. + +4. **Special Cases**: Statements without a `curie_id` or with non-matching patterns are logged as special cases and assigned sequential indices. + +5. **Population Name Matching**: The command uses case-insensitive matching for population names and handles special regex characters in population names. + +## When to Use This Command + +- After importing/ingesting statements with `curie_id` values +- To resolve duplicate population indices +- To correct population index assignments after data migrations +- To ensure sequential and conflict-free population indices + diff --git a/applications/composer/backend/composer/management/commands/reassign_population_indices.py b/applications/composer/backend/composer/management/commands/reassign_population_indices.py new file mode 100644 index 00000000..2b93d600 --- /dev/null +++ b/applications/composer/backend/composer/management/commands/reassign_population_indices.py @@ -0,0 +1,304 @@ +import re +from datetime import datetime +from collections import defaultdict +from django.core.management.base import BaseCommand +from django.db import transaction +from composer.models import PopulationSet, ConnectivityStatement + + +class Command(BaseCommand): + help = 'Reassign population indices based on curie_id patterns for exported statements' + + def add_arguments(self, parser): + parser.add_argument( + '--dry-run', + action='store_true', + help='Show what would be changed without making changes', + ) + parser.add_argument( + '--output-file', + type=str, + default=None, + help='Output log file (default: population_index_reassignment_YYYY-MM-DD_HH-MM-SS.log)', + ) + parser.add_argument( + '--population', + type=str, + default=None, + help='Process only a specific population set by name', + ) + + def handle(self, *args, **options): + dry_run = options['dry_run'] + population_filter = options['population'] + + # Generate output 
filename with timestamp if not provided + if options['output_file']: + output_file = options['output_file'] + else: + timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + mode = "dry_run_" if dry_run else "" + output_file = f'population_index_reassignment_{mode}{timestamp}.log' + + self.stdout.write(self.style.SUCCESS('Starting population index reassignment...')) + + if dry_run: + self.stdout.write(self.style.WARNING('DRY RUN MODE - No changes will be made')) + + try: + with open(output_file, 'w', encoding='utf-8') as log_file: + self.log(log_file, f"Population Index Reassignment Report") + self.log(log_file, f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + self.log(log_file, f"Mode: {'DRY RUN' if dry_run else 'LIVE'}") + self.log(log_file, "=" * 80) + self.log(log_file, "") + + # Get population sets to process + population_sets = PopulationSet.objects.all() + if population_filter: + population_sets = population_sets.filter(name=population_filter.lower()) + if not population_sets.exists(): + raise ValueError(f"Population set '{population_filter}' not found") + + total_populations = population_sets.count() + self.log(log_file, f"Processing {total_populations} population set(s)...") + self.log(log_file, "") + + total_statements_processed = 0 + total_statements_reassigned = 0 + total_conflicts = 0 + + for population in population_sets: + result = self.process_population(population, log_file, dry_run) + total_statements_processed += result['processed'] + total_statements_reassigned += result['reassigned'] + total_conflicts += result['conflicts'] + + # Summary + self.log(log_file, "") + self.log(log_file, "=" * 80) + self.log(log_file, "SUMMARY") + self.log(log_file, "=" * 80) + self.log(log_file, f"Total population sets processed: {total_populations}") + self.log(log_file, f"Total statements processed: {total_statements_processed}") + self.log(log_file, f"Total statements reassigned: {total_statements_reassigned}") + self.log(log_file, f"Total 
conflicts resolved: {total_conflicts}") + + self.stdout.write( + self.style.SUCCESS( + f'\nProcess completed!\n' + f'Population sets processed: {total_populations}\n' + f'Statements processed: {total_statements_processed}\n' + f'Statements reassigned: {total_statements_reassigned}\n' + f'Conflicts resolved: {total_conflicts}\n' + f'Log written to: {output_file}' + ) + ) + + if dry_run: + self.stdout.write( + self.style.WARNING( + 'Run without --dry-run to apply these changes.' + ) + ) + + except Exception as e: + self.stdout.write(self.style.ERROR(f'Error: {e}')) + raise + + def process_population(self, population, log_file, dry_run): + """Process all exported statements for a given population set""" + self.log(log_file, f"Processing Population Set: {population.name}") + self.log(log_file, "-" * 80) + + # Get all exported statements for this population + statements = ConnectivityStatement.objects.filter( + population=population, + has_statement_been_exported=True + ).order_by('id') + + statement_count = statements.count() + self.log(log_file, f"Found {statement_count} exported statement(s)") + + if statement_count == 0: + self.log(log_file, "No exported statements to process") + self.log(log_file, "") + return {'processed': 0, 'reassigned': 0, 'conflicts': 0} + + # Maps to track population indices + hypothesis_index_map = {} # hypothesis_index -> list of statement objects + statement_assignments = {} # statement_id -> assigned index + bag = [] # Statements that couldn't get hypothesis index or have conflicts + special_cases = [] # Track cases where hypothesis index couldn't be retrieved + + # Create a dictionary for fast statement lookup by id + statements_dict = {s.id: s for s in statements} + + # Pattern to extract population index from curie_id + # Expected format: "neuron type {population_name} {population_index}" + pattern = rf"neuron type {re.escape(population.name)}\s+(\d+)" + compiled_pattern = re.compile(pattern, re.IGNORECASE) + + # First pass: Extract 
hypothesis indices and detect conflicts + self.log(log_file, "") + self.log(log_file, "Phase 1: Analyzing curie_id patterns...") + + for statement in statements: + hypothesis_index = None + + if statement.curie_id: + match = compiled_pattern.search(statement.curie_id) + if match: + hypothesis_index = int(match.group(1)) + self.log(log_file, f" Statement {statement.curie_id}: Found hypothesis index {hypothesis_index} from curie_id") + else: + self.log(log_file, f" WARNING - Could not extract index from curie_id: '{statement.curie_id}'") + special_cases.append({ + 'statement_id': statement.id, + 'curie_id': statement.curie_id, + 'reason': 'Pattern not matched' + }) + else: + self.log(log_file, f" Statement {statement.id}: WARNING - No curie_id present") + special_cases.append({ + 'statement_id': statement.id, + 'curie_id': None, + 'reason': 'Missing curie_id' + }) + + if hypothesis_index is not None: + if hypothesis_index not in hypothesis_index_map: + hypothesis_index_map[hypothesis_index] = [] + hypothesis_index_map[hypothesis_index].append(statement) + else: + bag.append(statement) + + # Second pass: Assign indices, handling conflicts + self.log(log_file, "") + self.log(log_file, "Phase 2: Assigning population indices...") + + conflicts_resolved = 0 + used_indices = set() + + # Process statements with hypothesis indices + for hypothesis_index in sorted(hypothesis_index_map.keys()): + statements_list = hypothesis_index_map[hypothesis_index] + + if len(statements_list) == 1: + # No conflict - assign directly + statement = statements_list[0] + statement_assignments[statement.id] = hypothesis_index + used_indices.add(hypothesis_index) + self.log(log_file, f" Statement {statement.curie_id}: Assigned index {hypothesis_index}") + else: + # Conflict - earliest statement (smallest id) gets the index + statements_list.sort(key=lambda s: s.id) + winner = statements_list[0] + losers = statements_list[1:] + + statement_assignments[winner.id] = hypothesis_index + 
used_indices.add(hypothesis_index) + conflicts_resolved += len(losers) + + self.log(log_file, f" Statement {winner.curie_id}: Assigned index {hypothesis_index} (conflict winner)") + for loser in losers: + self.log(log_file, f" Statement {loser.curie_id}: Moved to bag (conflict loser, had same hypothesis index {hypothesis_index})") + bag.append(loser) + + # Third pass: Assign indices to bag statements sequentially + if bag: + self.log(log_file, "") + self.log(log_file, f"Phase 3: Assigning indices to {len(bag)} statement(s) in bag...") + + # Find the next available index + if used_indices: + next_index = max(used_indices) + 1 + else: + next_index = 1 + + # Sort bag by statement id to maintain consistent ordering + bag.sort(key=lambda s: s.id) + + for statement in bag: + # Find next unused index + while next_index in used_indices: + next_index += 1 + + statement_assignments[statement.id] = next_index + used_indices.add(next_index) + self.log(log_file, f" Statement {statement.curie_id}: Assigned index {next_index} (from bag)") + next_index += 1 + + # Apply changes to database + statements_reassigned = 0 + + if not dry_run: + self.log(log_file, "") + self.log(log_file, "Phase 4: Updating database...") + + with transaction.atomic(): + for statement_id, new_index in statement_assignments.items(): + statement = statements_dict[statement_id] + old_index = statement.population_index + + if old_index != new_index: + statement.population_index = new_index + statement.save(update_fields=['population_index']) + statements_reassigned += 1 + self.log(log_file, f" Statement {statement.curie_id}: Updated from {old_index} to {new_index}") + else: + self.log(log_file, f" Statement {statement.curie_id}: No change needed (already {new_index})") + + # Update population's last_used_index + if used_indices: + new_last_index = max(used_indices) + old_last_index = population.last_used_index + population.last_used_index = new_last_index + population.save(update_fields=['last_used_index']) + 
self.log(log_file, f" Population {population.name}: Updated last_used_index from {old_last_index} to {new_last_index}") + else: + # In dry run, just report what would change + self.log(log_file, "") + self.log(log_file, "Phase 4: Database changes (DRY RUN - not applied)...") + + for statement_id, new_index in statement_assignments.items(): + statement = statements_dict[statement_id] + old_index = statement.population_index + + if old_index != new_index: + statements_reassigned += 1 + self.log(log_file, f" Statement {statement_id}: Would update from {old_index} to {new_index}") + else: + self.log(log_file, f" Statement {statement_id}: No change needed (already {new_index})") + + if used_indices: + new_last_index = max(used_indices) + self.log(log_file, f" Population {population.name}: Would update last_used_index to {new_last_index}") + + # Report special cases + if special_cases: + self.log(log_file, "") + self.log(log_file, "Special Cases (curie_id issues):") + for case in special_cases: + self.log(log_file, f" Statement {case['statement_id']}: {case['reason']}") + if case['curie_id']: + self.log(log_file, f" curie_id: '{case['curie_id']}'") + + self.log(log_file, "") + self.log(log_file, f"Completed population '{population.name}':") + self.log(log_file, f" - Statements processed: {statement_count}") + self.log(log_file, f" - Statements reassigned: {statements_reassigned}") + self.log(log_file, f" - Conflicts resolved: {conflicts_resolved}") + self.log(log_file, f" - Special cases: {len(special_cases)}") + self.log(log_file, "") + + return { + 'processed': statement_count, + 'reassigned': statements_reassigned, + 'conflicts': conflicts_resolved + } + + def log(self, file_handle, message): + """Write to both log file and stdout""" + file_handle.write(message + '\n') + self.stdout.write(message) diff --git a/applications/composer/backend/composer/models.py b/applications/composer/backend/composer/models.py index 62474212..a02b8538 100644 --- 
a/applications/composer/backend/composer/models.py +++ b/applications/composer/backend/composer/models.py @@ -895,8 +895,7 @@ def system_exported(self, *args, **kwargs): permission=ConnectivityStatementStateService.has_permission_to_transition_to_invalid, ) def invalid(self, *args, **kwargs): - self.has_statement_been_exported = True - self.save(update_fields = ["has_statement_been_exported"]) + self._perform_export_logic() @transition( field=state, diff --git a/applications/composer/backend/composer/services/cs_ingestion/helpers/statement_helper.py b/applications/composer/backend/composer/services/cs_ingestion/helpers/statement_helper.py index 7c72ab9c..c13ae752 100644 --- a/applications/composer/backend/composer/services/cs_ingestion/helpers/statement_helper.py +++ b/applications/composer/backend/composer/services/cs_ingestion/helpers/statement_helper.py @@ -97,7 +97,7 @@ def create_or_update_connectivity_statement( "population": get_or_create_populationset(populationset_name), "projection_phenotype": get_projection_phenotype(statement), "reference_uri": statement[ID], - "state": CSState.EXPORTED, + "state": CSState.NPO_APPROVED, "curie_id": statement[LABEL], } diff --git a/applications/composer/backend/composer/signals.py b/applications/composer/backend/composer/signals.py index 666a14ac..05fb2501 100644 --- a/applications/composer/backend/composer/signals.py +++ b/applications/composer/backend/composer/signals.py @@ -49,12 +49,20 @@ def post_transition_callback(sender, instance, name, source, target, **kwargs): sentence = instance else: sentence = None + + # Customize message based on whether transition was done by system (ingestion) or user + if user and user.username == "system": + note_message = f"Automatically transitioned from {source} to {target} during automated processes (e.g., ingestion)." 
+ else: + user_name = f"{user.first_name} {user.last_name}" if user else "Unknown user" + note_message = f"User {user_name} transitioned this record from {source} to {target}" + Note.objects.create( user=system_user, type=NoteType.TRANSITION, connectivity_statement=connectivity_statement, sentence=sentence, - note=f"User {user.first_name} {user.last_name} transitioned this record from {source} to {target}", + note=note_message, ) diff --git a/applications/composer/backend/requirements.txt b/applications/composer/backend/requirements.txt index 34255e49..3de48054 100644 --- a/applications/composer/backend/requirements.txt +++ b/applications/composer/backend/requirements.txt @@ -1,5 +1,5 @@ packaging==21.3 -Django==4.2.25 +Django==4.2.26 uvicorn==0.20.0 starlette>=0.41.3 pillow>=11.0.0 diff --git a/deployment/codefresh-stage.yaml b/deployment/codefresh-stage.yaml index 105e2e97..8ac3c74c 100644 --- a/deployment/codefresh-stage.yaml +++ b/deployment/codefresh-stage.yaml @@ -34,7 +34,8 @@ steps: commands: - bash cloud-harness/install.sh - harness-deployment cloud-harness . 
-t ${{DEPLOYMENT_TAG}} -d ${{DOMAIN}} -r - ${{REGISTRY}} -rs ${{REGISTRY_SECRET}} -n ${{NAMESPACE}} -e stage -i composer + ${{REGISTRY}} -rs '${{REGISTRY_SECRET}}' -n ${{NAMESPACE}} -e stage --no-cd -i + composer prepare_deployment_view: commands: - helm template ./deployment/helm --debug -n ${{NAMESPACE}} @@ -62,6 +63,17 @@ steps: custom_values: - apps_composer_harness_secrets_SECRET__KEY="${{SECRET__KEY}}" - apps_composer_harness_secrets_SOCIAL__AUTH__ORCID__SECRET="${{SOCIAL__AUTH__ORCID__SECRET}}" + wait_deployment: + stage: qa + title: Wait deployment to be ready + image: codefresh/kubectl + commands: + - kubectl config use-context ${{CLUSTER_NAME}} + - kubectl config set-context --current --namespace=${{NAMESPACE}} + - kubectl rollout status deployment/argo-gk + - kubectl rollout status deployment/accounts + - kubectl rollout status deployment/composer + - sleep 60 manual_tests: type: pending-approval stage: publish @@ -86,16 +98,16 @@ steps: stage: publish type: push title: Cloudharness frontend build - candidate: '${{REGISTRY}}/cloudharness/cloudharness-frontend-build:${{DEPLOYMENT_TAG}}' + candidate: '${{REGISTRY}}/cloud-harness/cloudharness-frontend-build:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest registry: '${{REGISTRY_PUBLISH_URL}}' - publish_cloudharness-base-debian: + publish_cloudharness-base: stage: publish type: push - title: Cloudharness base debian - candidate: '${{REGISTRY}}/cloudharness/cloudharness-base-debian:${{DEPLOYMENT_TAG}}' + title: Cloudharness base + candidate: '${{REGISTRY}}/cloud-harness/cloudharness-base:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest @@ -104,7 +116,7 @@ steps: stage: publish type: push title: Cloudharness django - candidate: '${{REGISTRY}}/cloudharness/cloudharness-django:${{DEPLOYMENT_TAG}}' + candidate: '${{REGISTRY}}/cloud-harness/cloudharness-django:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest @@ -113,7 +125,7 @@ steps: stage: publish type: 
push title: Accounts - candidate: '${{REGISTRY}}/cloudharness/accounts:${{DEPLOYMENT_TAG}}' + candidate: '${{REGISTRY}}/cloud-harness/accounts:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest @@ -122,7 +134,16 @@ steps: stage: publish type: push title: Composer - candidate: '${{REGISTRY}}/cloudharness/composer:${{DEPLOYMENT_TAG}}' + candidate: '${{REGISTRY}}/cloud-harness/composer:${{DEPLOYMENT_TAG}}' + tags: + - '${{DEPLOYMENT_PUBLISH_TAG}}' + - latest + registry: '${{REGISTRY_PUBLISH_URL}}' + publish_composer-neurondm: + stage: publish + type: push + title: Composer neurondm + candidate: '${{REGISTRY}}/cloud-harness/composer-neurondm:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest @@ -131,7 +152,7 @@ steps: stage: publish type: push title: Composer notify - candidate: '${{REGISTRY}}/cloudharness/composer-notify:${{DEPLOYMENT_TAG}}' + candidate: '${{REGISTRY}}/cloud-harness/composer-notify:${{DEPLOYMENT_TAG}}' tags: - '${{DEPLOYMENT_PUBLISH_TAG}}' - latest