Skip to content

Commit dcb6bd4

Browse files
authored
Merge pull request #11671 from mkovalua/fix/ENG-10028-pbs-26-6
[ENG-10028] SHARE is not consistently indexing OSF content
2 parents 8c5c641 + 75d571e commit dcb6bd4

14 files changed

Lines changed: 329 additions & 15 deletions

File tree

admin/base/urls.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
re_path(r'^cedar_metadata_templates/', include('admin.cedar.urls', namespace='cedar_metadata_templates')),
3838
re_path(r'^draft_registrations/', include('admin.draft_registrations.urls', namespace='draft_registrations')),
3939
re_path(r'^files/', include('admin.files.urls', namespace='files')),
40+
re_path(r'^share_reindex/', include('admin.share_reindex.urls', namespace='share_reindex')),
4041
]),
4142
),
4243
]

admin/share_reindex/__init__.py

Whitespace-only changes.

admin/share_reindex/urls.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from django.urls import re_path
2+
from . import views
3+
4+
# Application namespace declared by this URL module.
app_name = 'admin'

# Two routes: the bare prefix renders the list view, and `<resource_type>/`
# is handled by the bulk reindex view for that resource type.
urlpatterns = [
    re_path(
        r'^$',
        views.FailedShareIndexedGuidList.as_view(),
        name='list',
    ),
    re_path(
        r'^(?P<resource_type>[^/]+)/$',
        views.FailedShareIndexedGuidReindex.as_view(),
        name='reindex-share-resource',
    ),
]

admin/share_reindex/views.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from django.contrib.auth.mixins import PermissionRequiredMixin
2+
from django.urls import reverse
3+
from django.shortcuts import redirect
4+
from django.views.generic import ListView, View
5+
from osf.models import Guid
6+
from urllib.parse import urlencode
7+
from api.share.utils import get_not_indexed_guids_for_resource_with_no_indexed_guid, task__reindex_failed_or_not_indexed_resource_into_share
8+
9+
class FailedShareIndexedGuidList(PermissionRequiredMixin, ListView):
    """Paginated admin list of resources that are not marked as indexed in SHARE.

    The resource type is chosen with the ``type`` query parameter. A missing
    or unrecognised value falls back to ``'projects'`` so that a hand-edited
    URL renders the default list instead of failing.
    """
    paginate_by = 25
    template_name = 'share_reindex/list.html'
    permission_required = 'osf.update_share_reindex'
    raise_exception = True
    model = Guid

    _DEFAULT_RESOURCE_TYPE = 'projects'
    # URL name of the admin detail page for each selectable resource type.
    _DETAIL_URL_NAMES = {
        'users': 'users:user',
        'preprints': 'preprints:preprint',
        'registries': 'nodes:node',
        'projects': 'nodes:node',
        'files': 'files:file',
    }
    # URL name of the per-item reindex endpoint. 'files' has no entry; the
    # template hides the per-item Reindex column for files.
    _REINDEX_URL_NAMES = {
        'users': 'users:reindex-share-user',
        'preprints': 'preprints:reindex-share-preprint',
        'registries': 'nodes:reindex-share-node',
        'projects': 'nodes:reindex-share-node',
    }

    def _selected_resource_type(self):
        """Return the requested resource type, or the default if it is unknown."""
        requested = self.request.GET.get('type', self._DEFAULT_RESOURCE_TYPE)
        if requested in self._DETAIL_URL_NAMES:
            return requested
        return self._DEFAULT_RESOURCE_TYPE

    def get_queryset(self):
        """Return the not-yet-indexed resources of the selected type."""
        return get_not_indexed_guids_for_resource_with_no_indexed_guid(self._selected_resource_type())

    def get_context_data(self, **kwargs):
        """Build the template context.

        Adds ``items_to_index`` (current page of resources), ``page`` (the
        page object used by the pagination include), the selected resource
        type, the URL names used for detail/reindex links, and the status
        message shown after a bulk reindex has been started.
        """
        resource_type = self._selected_resource_type()
        kwargs.setdefault('selected_resource_type', resource_type)
        kwargs.setdefault('resource_detail', self._DETAIL_URL_NAMES[resource_type])
        kwargs.setdefault('resource_guid_reindex', self._REINDEX_URL_NAMES.get(resource_type))
        status_msg = f'Reindex of {resource_type} started, please check later.' if self.request.GET.get('status') == 'indexing' else ''
        kwargs.setdefault('share_reindex_message', status_msg)

        # ListView paginates once here; reuse its result rather than
        # paginating a second time by hand (which costs an extra COUNT query).
        context = super().get_context_data(**kwargs)
        context.setdefault('items_to_index', context['object_list'])
        context.setdefault('page', context['page_obj'])
        return context
40+
41+
42+
class FailedShareIndexedGuidReindex(PermissionRequiredMixin, View):
    """Start a background SHARE reindex of all not-indexed resources of one type."""
    permission_required = 'osf.update_share_reindex'
    raise_exception = True

    # Resource types offered by the list page and understood by the query helper.
    _KNOWN_RESOURCE_TYPES = frozenset({'projects', 'preprints', 'registries', 'users', 'files'})

    def post(self, request, *args, **kwargs):
        """Enqueue the reindex task and redirect back to the list page.

        Raises:
            Http404: if the ``resource_type`` URL segment is not a known type,
                so arbitrary strings are never handed to the Celery task.
        """
        from django.http import Http404

        resource_type = self.kwargs.get('resource_type')
        if resource_type not in self._KNOWN_RESOURCE_TYPES:
            raise Http404(f'Unknown resource type: {resource_type}')

        # Runs in a background task; with the task's default chunking
        # (200 chunks of 500) one run covers up to 100_000 resources.
        task__reindex_failed_or_not_indexed_resource_into_share.delay(resource_type)

        # Send the admin back to the same resource type with a status banner.
        base_url = reverse('share_reindex:list')
        query_string = urlencode({'type': resource_type, 'status': 'indexing'})
        return redirect(f"{base_url}?{query_string}")

admin/templates/base.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,9 @@
317317
{% if perms.osf.change_cedarmetadatatemplate %}
318318
<li><a href="{% url 'cedar_metadata_templates:list' %}"><i class='fa fa-link'></i> <span>Cedar Metadata Templates</span></a></li>
319319
{% endif %}
320+
{% if perms.osf.update_share_reindex %}
321+
<li><a href="{% url 'share_reindex:list' %}"><i class='fa fa-link'></i> <span>Share Reindex</span></a></li>
322+
{% endif %}
320323
{% if perms.osf.change_maintenancestate %}
321324
<li><a href="{% url 'maintenance:display' %}"><i class='fa fa-link'></i> <span>Maintenance Alerts</span></a></li>
322325
{% endif %}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
{% extends "base.html" %}
2+
{% load render_bundle from webpack_loader %}
3+
{% load comment_extras %}
4+
5+
{% load static %}
6+
{% block top_includes %}
7+
<link rel="stylesheet" type="text/css" href="/static/css/institutions.css" />
8+
{% endblock %}
9+
{% block title %}
10+
<title>Share Reindex</title>
11+
{% endblock title %}
12+
{% block content %}
13+
<h2>Share Reindex</h2>
14+
15+
{% include "util/pagination.html" with items=page extra_query_params="&type="|add:selected_resource_type %}
16+
17+
18+
<div class="row" style="margin-bottom: 20px;">
19+
<div class="col-md-3">
20+
<form method="GET" action="">
21+
<select class="form-control" name="type" onchange="this.form.submit()">
22+
<option value="projects" {% if selected_resource_type == 'projects' %}selected{% endif %}>Projects</option>
23+
<option value="preprints" {% if selected_resource_type == 'preprints' %}selected{% endif %}>Preprints</option>
24+
<option value="registries" {% if selected_resource_type == 'registries' %}selected{% endif %}>Registries</option>
25+
<option value="users" {% if selected_resource_type == 'users' %}selected{% endif %}>Users</option>
26+
<option value="files" {% if selected_resource_type == 'files' %}selected{% endif %}>Files</option>
27+
</select>
28+
29+
</form>
30+
</div>
31+
<div class="col-md-3">
32+
<a data-toggle="modal" data-target="#confirmReindexShareNodes" class="btn btn-primary">
33+
SHARE Reindex All {{selected_resource_type}}
34+
</a>
35+
36+
<div class="modal" id="confirmReindexShareNodes">
37+
<div class="modal-dialog">
38+
<div class="modal-content">
39+
40+
<form method="post" action="{% url 'admin:reindex-share-resource' resource_type=selected_resource_type %}">
41+
{% csrf_token %}
42+
<div class="modal-header">
43+
<button type="button" class="close" data-dismiss="modal">×</button>
44+
<h3>Are you sure you want to reindex {{selected_resource_type}} (SHARE)?</h3>
45+
</div>
46+
47+
<div class="modal-footer">
48+
<button type="button" class="btn btn-default" data-dismiss="modal">Cancel</button>
49+
<input class="btn btn-primary" type="submit" value="Confirm Re-index" />
50+
</div>
51+
</form>
52+
53+
</div>
54+
</div>
55+
</div>
56+
</div>
57+
</div>
58+
59+
<div>
60+
<p>{{share_reindex_message}}</p>
61+
</div>
62+
63+
64+
<table class="table table-striped table-hover table-responsive">
65+
<thead>
66+
<tr>
67+
<th>Guid</th>
68+
{% if selected_resource_type == 'projects' or selected_resource_type == 'preprints' or selected_resource_type == 'registries' %}
69+
<th>Title</th>
70+
{% elif selected_resource_type == 'users' %}
71+
<th>Fullname</th>
72+
{% else %}
73+
<th>Name</th>
74+
{% endif %}
75+
<th>Datetime Last Indexed</th>
76+
<!-- There is currently no per-file reindex option on the file detail page, so files get no Reindex column. -->
77+
{% if selected_resource_type != 'files' %}
78+
<th>Reindex</th>
79+
{% endif %}
80+
</tr>
81+
</thead>
82+
<tbody>
83+
{% for item in items_to_index %}
84+
<tr>
85+
<td>
86+
<a href="{% url resource_detail guid=item.first_guid %}">
87+
{{item.first_guid}}
88+
</a>
89+
</td>
90+
{% if selected_resource_type == 'projects' or selected_resource_type == 'preprints' or selected_resource_type == 'registries' %}
91+
<td>{{item.title}}</td>
92+
{% elif selected_resource_type == 'users' %}
93+
<td>{{item.fullname}}</td>
94+
{% else %}
95+
<td>{{item.name}}</td>
96+
{% endif %}
97+
98+
<td>{{item.date_last_indexed}}</td>
99+
100+
{% if selected_resource_type != 'files' %}
101+
<td>
102+
<a data-toggle="modal" data-target="#confirmReindexShareNode-{{ item.first_guid }}" class="btn btn-primary">SHARE Reindex</a>
103+
</td>
104+
<div class="modal" id="confirmReindexShareNode-{{ item.first_guid }}">
105+
<div class="modal-dialog">
106+
<div class="modal-content">
107+
<form class="well" method="post" action="{% url resource_guid_reindex guid=item.first_guid %}">
108+
<div class="modal-header">
109+
<button type="button" class="close" data-dismiss="modal">x</button>
110+
<h3>Are you sure you want to reindex this node (SHARE)? {{ item.first_guid }}</h3>
111+
</div>
112+
{% csrf_token %}
113+
<div class="modal-footer">
114+
<input class="btn btn-danger" type="submit" value="Confirm" />
115+
<button type="button" class="btn btn-default" data-dismiss="modal">
116+
Cancel
117+
</button>
118+
</div>
119+
</form>
120+
121+
</div>
122+
{# Data from above link #}
123+
</div>
124+
</div>
125+
{% endif %}
126+
127+
128+
</tr>
129+
{% endfor %}
130+
</tbody>
131+
</table>
132+
133+
{% endblock content %}

admin/templates/util/pagination.html

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
<div class="pagination pagination-lg">
44
<span>
55
{% if items.has_previous %}
6-
<a href="?page=1&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}"
6+
<a href="?page=1&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}{{ extra_query_params }}"
77
class="btn btn-primary">
88
|
99
</a>
10-
<a href="?page={{ items.previous_page_number }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}"
10+
<a href="?page={{ items.previous_page_number }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}{{ extra_query_params }}"
1111
class="btn btn-primary">
1212
<i class="fa fa-angle-left"></i>
1313
</a>
@@ -25,11 +25,11 @@
2525
</span>
2626

2727
{% if items.has_next %}
28-
<a href="?page={{ items.next_page_number }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}"
28+
<a href="?page={{ items.next_page_number }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}{{ extra_query_params }}"
2929
class="btn btn-primary">
3030
<i class="fa fa-angle-right"></i>
3131
</a>
32-
<a href="?page={{ items.paginator.num_pages }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}"
32+
<a href="?page={{ items.paginator.num_pages }}&amp;status={{ status }}&amp;p={{ pagin }}&amp;order_by={{ order }}{{ extra_query_params }}"
3333
class="btn btn-primary">
3434
|
3535
</a>
@@ -44,11 +44,11 @@
4444
</span>
4545
{% if pagin %}
4646
<span>
47-
<a href="?p=10&amp;order_by={{ order }}&amp;status={{ status }}"
47+
<a href="?p=10&amp;order_by={{ order }}&amp;status={{ status }}{{ extra_query_params }}"
4848
class="btn btn-primary">10</a>
49-
<a href="?p=25&amp;order_by={{ order }}&amp;status={{ status }}"
49+
<a href="?p=25&amp;order_by={{ order }}&amp;status={{ status }}{{ extra_query_params }}"
5050
class="btn btn-primary">25</a>
51-
<a href="?p=50&amp;order_by={{ order }}&amp;status={{ status }}"
51+
<a href="?p=50&amp;order_by={{ order }}&amp;status={{ status }}{{ extra_query_params }}"
5252
class="btn btn-primary">50</a>
5353
</span>
5454
{% endif %}

api/share/utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@
66
import logging
77

88
from django.apps import apps
9+
from django.db.models import Q, OuterRef, Subquery
10+
from django.contrib.contenttypes.models import ContentType
911
from celery.utils.time import get_exponential_backoff_interval
1012
import requests
1113

14+
1215
from framework.celery_tasks import app as celery_app
1316
from framework.celery_tasks.handlers import enqueue_task
1417
from framework.encryption import ensure_bytes
@@ -80,6 +83,7 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name
8083
raise ValueError(f'unknown osfguid "{guid}"')
8184
_resource = _osfid_instance.referent
8285
_is_deletion = _should_delete_indexcard(_resource)
86+
_resource.mark_indexing_failed()
8387
try:
8488
_response = (
8589
pls_delete_trove_record(_resource, osfmap_partition=_osfmap_partition)
@@ -115,6 +119,7 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name
115119
if HTTPStatus(_response.status_code).is_server_error:
116120
raise self.retry(exc=e)
117121
else: # success response
122+
_resource.mark_indexing_success()
118123
if not _is_deletion:
119124
# enqueue followup task for supplementary metadata
120125
_next_partition = _next_osfmap_partition(_osfmap_partition)
@@ -126,6 +131,39 @@ def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name
126131
)
127132

128133

134+
@celery_app.task
def task__reindex_failed_or_not_indexed_resource_into_share(resource_type: str, start_id: int = 0, chunk_count: int = 200, chunk_size: int = 500):
    """Recatalog the resources of ``resource_type`` that are not marked as indexed.

    Args:
        resource_type: key understood by
            ``get_not_indexed_guids_for_resource_with_no_indexed_guid``.
        start_id, chunk_count, chunk_size: forwarded unchanged to ``recatalog``.
    """
    # Imported here rather than at module level, as in the original.
    from osf.management.commands.recatalog_metadata import recatalog

    # TODO: the best chunk count and chunk size are still to be agreed with the Cloud Team.
    recatalog(
        get_not_indexed_guids_for_resource_with_no_indexed_guid(resource_type, only_oldest_guid=False),
        start_id,
        chunk_count,
        chunk_size,
    )
140+
141+
142+
def get_not_indexed_guids_for_resource_with_no_indexed_guid(resource_type: str, only_oldest_guid: bool = True):
    """Return the resources of ``resource_type`` that still need SHARE indexing.

    Args:
        resource_type: one of ``'projects'``, ``'preprints'``, ``'registries'``,
            ``'users'`` or ``'files'``. Any other value is treated as
            ``'projects'``.
        only_oldest_guid: when true, annotate each row with ``first_guid``
            (the ``_id`` of its earliest-created Guid), drop rows that have no
            Guid, and return ``.values()`` rows limited to the display
            columns. When false, return the filtered model queryset itself.

    Returns:
        A Django queryset (of dicts when ``only_oldest_guid`` is true, of
        model instances otherwise).
    """
    from osf.models import Guid, Registration, Preprint, Node, OSFUser
    from addons.osfstorage.models import OsfStorageFile

    not_deleted = Q(deleted__isnull=True)
    # "Not indexed" covers both an explicit failure (False) and never attempted (NULL).
    not_indexed = Q(has_been_indexed=False) | Q(has_been_indexed__isnull=True)
    public_not_indexed = Q(is_public=True) & not_deleted & not_indexed
    titled_columns = ('first_guid', 'date_last_indexed', 'title')

    # resource type -> (model, filter, columns returned when only_oldest_guid is true)
    resource_mapper = {
        'projects': (Node, public_not_indexed, titled_columns),
        'preprints': (Preprint, public_not_indexed & Q(is_published=True), titled_columns),
        'registries': (Registration, public_not_indexed, titled_columns),
        'users': (OSFUser, Q(is_active=True) & not_deleted & not_indexed, ('first_guid', 'fullname', 'date_last_indexed')),
        # NOTE(review): unlike the other types, files are not filtered on
        # has_been_indexed, so already-indexed files are returned too -- confirm
        # whether that is intended.
        'files': (OsfStorageFile, not_deleted, ('first_guid', 'name', 'date_last_indexed')),
    }
    # The fallback must be a mapper entry: a bare string default cannot be
    # unpacked into three names and raised ValueError for unknown types.
    resource_model, query, values_to_return = resource_mapper.get(resource_type, resource_mapper['projects'])

    matching = resource_model.objects.filter(query)
    if not only_oldest_guid:
        return matching

    model_content_type = ContentType.objects.get_for_model(resource_model)
    # Correlated subquery selecting the earliest-created Guid of each row.
    first_guid_sq = Guid.objects.filter(
        content_type=model_content_type,
        object_id=OuterRef('pk'),
    ).order_by('created').values('_id')[:1]
    return (
        matching
        .annotate(first_guid=Subquery(first_guid_sq))
        .exclude(first_guid__isnull=True)
        .values(*values_to_return)
    )
165+
166+
129167
def pls_send_trove_record(osf_item, *, is_backfill: bool, osfmap_partition: OsfmapPartition):
130168
try:
131169
_iri = osf_item.get_semantic_iri()
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Generated by Django 4.2.26 on 2026-03-31 15:44
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add ``date_last_indexed`` and ``has_been_indexed`` to four models.

    Both columns are nullable; ``has_been_indexed`` is indexed and defaults
    to NULL.
    """

    dependencies = [
        ('osf', '0037_notification_refactor_post_release'),
    ]

    operations = [
        # abstractnode
        migrations.AddField(
            model_name='abstractnode',
            name='date_last_indexed',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='abstractnode',
            name='has_been_indexed',
            field=models.BooleanField(blank=True, db_index=True, default=None, null=True),
        ),
        # basefilenode
        migrations.AddField(
            model_name='basefilenode',
            name='date_last_indexed',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='basefilenode',
            name='has_been_indexed',
            field=models.BooleanField(blank=True, db_index=True, default=None, null=True),
        ),
        # osfuser
        migrations.AddField(
            model_name='osfuser',
            name='date_last_indexed',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='osfuser',
            name='has_been_indexed',
            field=models.BooleanField(blank=True, db_index=True, default=None, null=True),
        ),
        # preprint
        migrations.AddField(
            model_name='preprint',
            name='date_last_indexed',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='preprint',
            name='has_been_indexed',
            field=models.BooleanField(blank=True, db_index=True, default=None, null=True),
        ),
    ]

osf/models/files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from framework import sentry
1818
from .base import BaseModel, OptionalGuidMixin, ObjectIDMixin
1919
from .comment import CommentableMixin
20-
from .mixins import Taggable
20+
from .mixins import Taggable, ShareIndexMixin
2121
from .validators import validate_location
2222
from osf.utils.datetime_aware_jsonfield import DateTimeAwareJSONField
2323
from osf.utils.fields import NonNaiveDateTimeField
@@ -64,7 +64,7 @@ class UnableToResolveFileClass(Exception):
6464
pass
6565

6666

67-
class BaseFileNode(TypedModel, CommentableMixin, OptionalGuidMixin, Taggable, ObjectIDMixin, BaseModel):
67+
class BaseFileNode(TypedModel, CommentableMixin, OptionalGuidMixin, Taggable, ObjectIDMixin, ShareIndexMixin, BaseModel):
6868
"""Base class for all provider-specific file models and the trashed file model.
6969
This class should generally not be used or created manually. Use the provider-specific
7070
subclasses instead.

0 commit comments

Comments
 (0)