From 48a5dbe698e81f5c6cc71ceb83a071ceaedcbdb1 Mon Sep 17 00:00:00 2001 From: Shankar Ambady Date: Wed, 29 Apr 2026 08:58:02 -0400 Subject: [PATCH 1/4] Vector search pagination fix for empty hybrid searches (#3251) * adding stopgap pagination fix * Update vector_search/views.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * adding limits to offset and pagination + tests * fix min value * update spec * fix spec * fix spec --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- openapi/specs/v0.yaml | 8 +++ vector_search/constants.py | 3 + vector_search/serializers.py | 39 ++++++++++-- vector_search/serializers_test.py | 46 +++++++++++++++ vector_search/views.py | 26 ++++++-- vector_search/views_test.py | 98 +++++++++++++++++++++++++++++++ 6 files changed, 212 insertions(+), 8 deletions(-) diff --git a/openapi/specs/v0.yaml b/openapi/specs/v0.yaml index 2a65b6cd16..b18dda0d69 100644 --- a/openapi/specs/v0.yaml +++ b/openapi/specs/v0.yaml @@ -923,6 +923,8 @@ paths: name: limit schema: type: integer + maximum: 1000 + minimum: 1 description: Number of results to return per page - in: query name: offered_by @@ -936,6 +938,8 @@ paths: name: offset schema: type: integer + maximum: 1000 + minimum: 0 description: The initial index from which to return the results - in: query name: platform @@ -1247,6 +1251,8 @@ paths: name: limit schema: type: integer + maximum: 1000 + minimum: 1 description: Number of results to return per page - in: query name: ocw_topic @@ -1286,6 +1292,8 @@ paths: name: offset schema: type: integer + maximum: 1000 + minimum: 0 description: The initial index from which to return the results - in: query name: platform diff --git a/vector_search/constants.py b/vector_search/constants.py index 5aa7223a57..bf2deb09de 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -139,3 +139,6 @@ TOPICS_COLLECTION_NAME: QDRANT_TOPICS_PARAM_MAP, CONTENT_FILES_COLLECTION_NAME: QDRANT_CONTENT_FILE_PARAM_MAP, } + +# Maximum value of offset + limit accepted by paginated vector search +MAX_RESULT_WINDOW = 1000 diff --git a/vector_search/serializers.py b/vector_search/serializers.py index 47aee71862..6e66a7687a 100644 --- a/vector_search/serializers.py +++ b/vector_search/serializers.py @@ -20,12 +20,25 @@ SearchResponseSerializer, ) from vector_search.constants import ( + MAX_RESULT_WINDOW, QDRANT_CONTENT_FILE_PARAM_MAP, QDRANT_LEARNING_RESOURCE_SORTBY_FIELDS, QDRANT_RESOURCE_PARAM_MAP, ) +def _validate_result_window(attrs): + offset = attrs.get("offset") or 0 + limit = attrs.get("limit") or 0 + if offset + limit > MAX_RESULT_WINDOW: + msg = ( + f"offset + limit must not exceed {MAX_RESULT_WINDOW}. " + "Use more specific filters to narrow results." + ) + raise serializers.ValidationError(msg) + return attrs + + class LearningResourcesSearchFiltersSerializer(serializers.Serializer): """ Shared filter fields for Qdrant-backed learning resource queries. @@ -195,10 +208,16 @@ class LearningResourcesVectorSearchRequestSerializer( ) q = serializers.CharField(required=False, help_text="The search text") offset = serializers.IntegerField( - required=False, help_text="The initial index from which to return the results" + required=False, + help_text="The initial index from which to return the results", + min_value=0, + max_value=MAX_RESULT_WINDOW, ) limit = serializers.IntegerField( - required=False, help_text="Number of results to return per page" + required=False, + help_text="Number of results to return per page", + min_value=1, + max_value=MAX_RESULT_WINDOW, ) hybrid_search = serializers.BooleanField( required=False, @@ -206,6 +225,9 @@ class LearningResourcesVectorSearchRequestSerializer( help_text="Whether to use a hybrid search", ) + def validate(self, attrs): + return _validate_result_window(attrs) + class LearningResourcesVectorSearchResponseSerializer(SearchResponseSerializer): """ @@ -233,10 +255,16 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): q = serializers.CharField(required=False, help_text="The search text") offset = serializers.IntegerField( - required=False, help_text="The initial index from which to return the results" + required=False, + help_text="The initial index from which to return the results", + min_value=0, + max_value=MAX_RESULT_WINDOW, ) limit = serializers.IntegerField( - required=False, help_text="Number of results to return per page" + required=False, + help_text="Number of results to return per page", + min_value=1, + max_value=MAX_RESULT_WINDOW, ) aggregation_choices = [ (key, key.replace("_", " ").title()) for key in QDRANT_CONTENT_FILE_PARAM_MAP @@ -315,6 +343,9 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): help_text="Whether to use a hybrid search", ) + def validate(self, attrs): + return _validate_result_window(attrs) + class ContentFileVectorSearchResponseSerializer(SearchResponseSerializer): """ diff --git a/vector_search/serializers_test.py b/vector_search/serializers_test.py index c7e61bb9cd..14266cc01b 100644 --- a/vector_search/serializers_test.py +++ b/vector_search/serializers_test.py @@ -1,4 +1,6 @@ +from vector_search.constants import MAX_RESULT_WINDOW from vector_search.serializers import ( + ContentFileVectorSearchRequestSerializer, LearningResourcesSearchFiltersSerializer, LearningResourcesVectorSearchRequestSerializer, ) @@ -37,3 +39,47 @@ def test_vector_search_request_serializer_inherits_filter_fields(): assert "platform" in fields assert "q" in fields assert "hybrid_search" in fields + + +def test_vector_search_result_window_validation(): + """Test that the result window (offset + limit) is validated.""" + # Valid window + data = {"offset": 10, "limit": 10} + s = LearningResourcesVectorSearchRequestSerializer(data=data) + assert s.is_valid(), s.errors + + # Invalid window: offset + limit > MAX_RESULT_WINDOW + data = {"offset": MAX_RESULT_WINDOW, "limit": 1} + s = LearningResourcesVectorSearchRequestSerializer(data=data) + assert not s.is_valid() + assert ( + f"offset + limit must not exceed {MAX_RESULT_WINDOW}" + in s.errors["non_field_errors"][0] + ) + + # Boundary case: offset + limit == MAX_RESULT_WINDOW + data = {"offset": MAX_RESULT_WINDOW - 10, "limit": 10} + s = LearningResourcesVectorSearchRequestSerializer(data=data) + assert s.is_valid(), s.errors + + +def test_content_file_vector_search_result_window_validation(): + """Test that the result window (offset + limit) is validated for content files.""" + # Valid window + data = {"offset": 10, "limit": 10} + s = ContentFileVectorSearchRequestSerializer(data=data) + assert s.is_valid(), s.errors + + # Invalid window: offset + limit > MAX_RESULT_WINDOW + data = {"offset": MAX_RESULT_WINDOW, "limit": 1} + s = ContentFileVectorSearchRequestSerializer(data=data) + assert not s.is_valid() + assert ( + f"offset + limit must not exceed {MAX_RESULT_WINDOW}" + in s.errors["non_field_errors"][0] + ) + + # Boundary case: offset + limit == MAX_RESULT_WINDOW + data = {"offset": MAX_RESULT_WINDOW - 10, "limit": 10} + s = ContentFileVectorSearchRequestSerializer(data=data) + assert s.is_valid(), s.errors diff --git a/vector_search/views.py b/vector_search/views.py index de7b4b5d73..f88ac8a1a9 100644 --- a/vector_search/views.py +++ b/vector_search/views.py @@ -216,18 +216,29 @@ async def _execute_group_search(self, client, search_params, params): async def _execute_scroll_search( # noqa: PLR0913 self, client, search_collection, search_filter, limit, offset, order_by ): - remaining_to_skip = offset - next_page_offset = None - search_result = [] - # Build common scroll kwargs scroll_kwargs = { "collection_name": search_collection, "scroll_filter": search_filter, "with_vectors": False, } + if order_by: + # Qdrant disables pagination (next_page_offset) when order_by + # is used. Fetch offset+limit results in one call and slice + # on the client side. scroll_kwargs["order_by"] = self._format_order_by(order_by) + scroll_res = await client.scroll( + **scroll_kwargs, + limit=offset + limit, + ) + page_points, _ = scroll_res + return page_points[offset : offset + limit] + + # Standard pagination loop for non-ordered scrolls + remaining_to_skip = offset + next_page_offset = None + search_result = [] while True: fetch_size = min(max(remaining_to_skip, limit), 1000) @@ -295,6 +306,13 @@ async def async_vector_search( # noqa: PLR0913 search_result = await self._execute_group_search( client, search_params, params ) + elif order_by: + # Qdrant's query_points does not support offset with + # OrderByQuery. Fetch offset+limit results and slice + # manually on the client side. + search_params["limit"] = offset + limit + result_obj = await client.query_points(**search_params) + search_result = result_obj.points[offset : offset + limit] else: search_params["offset"] = offset result_obj = await client.query_points(**search_params) diff --git a/vector_search/views_test.py b/vector_search/views_test.py index a166362b7e..9bf9b121d9 100644 --- a/vector_search/views_test.py +++ b/vector_search/views_test.py @@ -421,3 +421,101 @@ def test_vector_search_sortby_parameter(mocker, client, query_string, hybrid_sea assert "order_by" in call_kwargs assert call_kwargs["order_by"].key == "views" assert call_kwargs["order_by"].direction == models.Direction.DESC + + +def test_vector_search_sortby_pagination(mocker, client): + """Test that sortby with offset uses client-side slicing instead of Qdrant offset. + + Qdrant's query_points does not support offset with OrderByQuery, so + we must fetch offset+limit results and slice on the client side. + """ + + mock_qdrant = mocker.patch( + "qdrant_client.AsyncQdrantClient", return_value=mocker.AsyncMock() + )() + + # Create 80 mock points to simulate a large result set + mock_points = [] + for i in range(80): + mock_point = mocker.MagicMock() + mock_point.payload = {"readable_id": f"resource-{i}"} + mock_points.append(mock_point) + + mock_result = mocker.MagicMock() + mock_result.points = mock_points + mock_qdrant.query_points = mocker.AsyncMock(return_value=mock_result) + mock_qdrant.scroll = mocker.AsyncMock(return_value=([], None)) + mock_qdrant.count = mocker.AsyncMock(return_value=CountResult(count=100)) + mocker.patch( + "vector_search.views.async_qdrant_client", + return_value=mock_qdrant, + ) + + params = { + "q": "test", + "sortby": "-created_on", + "limit": 20, + "offset": 60, + } + + client.get( + reverse("vector_search:v0:vector_learning_resources_search"), data=params + ) + + call_kwargs = mock_qdrant.query_points.mock_calls[0].kwargs + + # Should request offset+limit results, not just limit + assert call_kwargs["limit"] == 80 # 60 + 20 + + # Should NOT pass offset to Qdrant when using OrderByQuery + assert "offset" not in call_kwargs + + +def test_vector_search_sortby_scroll_pagination(mocker, client): + """Test that sortby with offset on scroll (no query) uses client-side slicing. + + Qdrant disables scroll pagination (next_page_offset) when order_by + is used. We must fetch offset+limit in one call and slice. + """ + + mock_qdrant = mocker.patch( + "qdrant_client.AsyncQdrantClient", return_value=mocker.AsyncMock() + )() + + # Create 80 mock points to simulate a large result set + mock_points = [] + for i in range(80): + mock_point = mocker.MagicMock() + mock_point.payload = {"readable_id": f"resource-{i}"} + mock_points.append(mock_point) + + # scroll returns (points, next_page_offset=None) when order_by is used + mock_qdrant.scroll = mocker.AsyncMock(return_value=(mock_points, None)) + mock_qdrant.query_points = mocker.AsyncMock() + mock_qdrant.count = mocker.AsyncMock(return_value=CountResult(count=100)) + mocker.patch( + "vector_search.views.async_qdrant_client", + return_value=mock_qdrant, + ) + + params = { + "q": "", + "sortby": "-created_on", + "limit": 20, + "offset": 60, + } + + client.get( + reverse("vector_search:v0:vector_learning_resources_search"), data=params + ) + + call_kwargs = mock_qdrant.scroll.mock_calls[0].kwargs + + # Should request offset+limit in a single call + assert call_kwargs["limit"] == 80 # 60 + 20 + + # Should NOT pass a page offset (no cursor-based pagination with order_by) + assert "offset" not in call_kwargs or call_kwargs.get("offset") is None + + # query_points should not be called (no query string) + assert mock_qdrant.query_points.call_count == 0 From 159086c2e2fab71581450a8e6e7969c94335572f Mon Sep 17 00:00:00 2001 From: Anastasia Beglova Date: Thu, 30 Apr 2026 09:12:07 -0400 Subject: [PATCH 2/4] use youtube etl for playlists and ocw for videos (#3257) --- .../FeaturedVideo.tsx | 8 +- .../SeriesVideoList.tsx | 9 +- .../VideoPlaylistCollectionPage/VideoCard.tsx | 9 +- .../VideoDetailPage.tsx | 35 ++- .../VideoSeriesDetailPage.tsx | 15 +- .../videoSources.test.ts | 47 ++- .../videoSources.ts | 10 + .../LearningResourceExpanded/InfoSection.tsx | 8 +- learning_resources/etl/loaders.py | 102 ++++++- learning_resources/etl/loaders_test.py | 285 ++++++++++++++++-- learning_resources/etl/pipelines.py | 5 +- learning_resources/etl/youtube.py | 52 +++- learning_resources/etl/youtube_test.py | 61 +++- learning_resources/tasks.py | 10 +- learning_resources_search/api.py | 33 +- main/settings.py | 10 + main/settings_course_etl.py | 1 - 17 files changed, 599 insertions(+), 101 deletions(-) diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/FeaturedVideo.tsx b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/FeaturedVideo.tsx index fbfcf7259e..941e85b920 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/FeaturedVideo.tsx +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/FeaturedVideo.tsx @@ -184,7 +184,9 @@ const FeaturedVideo: React.FC = ({ totalVideos, totalTime, }) => { - const imageUrl = video.image?.url ?? null + const imageUrl = + video.image?.url ?? video.content_files?.[0]?.image_src ?? null + const duration = video.video?.duration ? formatDurationClockTime(video.video.duration) : null @@ -231,7 +233,9 @@ const FeaturedVideo: React.FC = ({ {description && ( - {description} + )} ) : ( diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/SeriesVideoList.tsx b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/SeriesVideoList.tsx index 3c83f6f7ed..f13f8bea8a 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/SeriesVideoList.tsx +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/SeriesVideoList.tsx @@ -227,9 +227,12 @@ export const EpisodeItem: React.FC = ({ {episode.title} - - {episode.description} - + diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoCard.tsx b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoCard.tsx index 386f04429b..8900d13d90 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoCard.tsx +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoCard.tsx @@ -142,8 +142,11 @@ type VideoCardProps = { const VideoCard: React.FC = ({ resource, href }) => { const [imgError, setImgError] = useState(false) - const imageUrl = - !imgError && resource.image?.url ? resource.image.url : PLACEHOLDER_IMG + const imageUrl = !imgError + ? (resource?.image?.url ?? + resource.content_files?.[0]?.image_src ?? + PLACEHOLDER_IMG) + : PLACEHOLDER_IMG const description = resource.description ?? "" const duration = resource.video?.duration ? formatDurationClockTime(resource.video.duration) @@ -171,7 +174,7 @@ const VideoCard: React.FC = ({ resource, href }) => { - {description} + diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoDetailPage.tsx b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoDetailPage.tsx index e1c1918d8c..1eadd25e4b 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoDetailPage.tsx +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoDetailPage.tsx @@ -388,7 +388,13 @@ const VideoDetailPage: React.FC = ({ const sources = useMemo( () => - video ? resolveVideoSources(video.video?.streaming_url, video.url) : [], + video + ? resolveVideoSources( + video.video?.streaming_url, + video.url, + video.content_files?.[0]?.youtube_id, + ) + : [], [video], ) @@ -523,6 +529,7 @@ const VideoDetailPage: React.FC = ({ sources[0]?.type === "video/youtube" ? undefined : (video?.video?.cover_image_url ?? + video?.content_files?.[0]?.image_src ?? video?.image?.url ?? undefined) } @@ -532,10 +539,12 @@ const VideoDetailPage: React.FC = ({ ariaLabel={`Video: ${videoTitleLabel}`} ariaDescribedBy="video-description" /> - ) : video?.image?.url ? ( + ) : video?.image?.url || video?.content_files?.[0]?.image_src ? ( {videoThumbnailAlt} = ({ {!isLoading && video?.description && ( - - {video?.description} - + )} {!isLoading && !video?.description && ( @@ -627,7 +637,10 @@ const VideoDetailPage: React.FC = ({ const itemDuration = item.video?.duration ? formatDurationClockTime(item.video.duration) : null - const imageUrl = item.image?.url ?? null + const imageUrl = + item.image?.url ?? + item.content_files?.[0]?.image_src ?? + null const itemTopicNames = (item.topics ?? []) .map((topic) => topic.name) .filter(Boolean) @@ -660,9 +673,11 @@ const VideoDetailPage: React.FC = ({ {item.title} {item.description && ( - - {item.description} - + )} diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoSeriesDetailPage.tsx b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoSeriesDetailPage.tsx index a9b0bf201a..a38787342a 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoSeriesDetailPage.tsx +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/VideoSeriesDetailPage.tsx @@ -64,7 +64,13 @@ const VideoSeriesDetailPage: React.FC = ({ const sources = useMemo( () => - video ? resolveVideoSources(video.video?.streaming_url, video.url) : [], + video + ? resolveVideoSources( + video.video?.streaming_url, + video.url, + video.content_files?.[0]?.youtube_id, + ) + : [], [video], ) @@ -274,9 +280,10 @@ const VideoSeriesDetailPage: React.FC = ({ {/* Description */} {!isLoading && video?.description && ( - - {video.description} - + )} {!isLoading && !video?.description && ( diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.test.ts b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.test.ts index 53135fbfcc..4a4ffdbe8e 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.test.ts +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.test.ts @@ -5,6 +5,7 @@ describe("resolveVideoSources", () => { const result = resolveVideoSources( "https://cdn.example.com/video/index.m3u8", null, + null, ) expect(result).toEqual([ { @@ -18,6 +19,7 @@ describe("resolveVideoSources", () => { const result = resolveVideoSources( "https://cdn.example.com/video/manifest.mpd", null, + null, ) expect(result).toEqual([ { @@ -31,6 +33,7 @@ describe("resolveVideoSources", () => { const result = resolveVideoSources( "https://cdn.example.com/video/clip.mp4", null, + null, ) expect(result).toEqual([ { src: "https://cdn.example.com/video/clip.mp4", type: "video/mp4" }, @@ -41,6 +44,7 @@ describe("resolveVideoSources", () => { const result = resolveVideoSources( "https://cdn.example.com/video/stream", "https://www.youtube.com/watch?v=abc", + null, ) // streamingUrl takes precedence over pageUrl expect(result).toEqual([ @@ -52,6 +56,7 @@ describe("resolveVideoSources", () => { const result = resolveVideoSources( null, "https://www.youtube.com/watch?v=abc123", + null, ) expect(result).toEqual([ { src: "https://www.youtube.com/watch?v=abc123", type: "video/youtube" }, @@ -59,23 +64,47 @@ describe("resolveVideoSources", () => { }) it("returns YouTube source when pageUrl is a youtu.be short link", () => { - const result = resolveVideoSources(null, "https://youtu.be/abc123") + const result = resolveVideoSources(null, "https://youtu.be/abc123", null) expect(result).toEqual([ { src: "https://youtu.be/abc123", type: "video/youtube" }, ]) }) - it("returns an empty array when both arguments are null", () => { - expect(resolveVideoSources(null, null)).toEqual([]) + it("returns YouTube source from youtubeId when no streaming URL", () => { + const result = resolveVideoSources(null, null, "dQw4w9WgXcQ") + expect(result).toEqual([ + { + src: "https://www.youtube.com/watch?v=dQw4w9WgXcQ", + type: "video/youtube", + }, + ]) }) - it("returns an empty array when both arguments are undefined", () => { - expect(resolveVideoSources(undefined, undefined)).toEqual([]) + it("prefers youtubeId over a non-YouTube pageUrl", () => { + const result = resolveVideoSources( + null, + "https://ocw.mit.edu/some-video", + "dQw4w9WgXcQ", + ) + expect(result).toEqual([ + { + src: "https://www.youtube.com/watch?v=dQw4w9WgXcQ", + type: "video/youtube", + }, + ]) }) - it("returns an empty array when there is no streaming URL and pageUrl is not a YouTube link", () => { - expect(resolveVideoSources(null, "https://ocw.mit.edu/some-video")).toEqual( - [], - ) + it("returns an empty array when all arguments are null", () => { + expect(resolveVideoSources(null, null, null)).toEqual([]) + }) + + it("returns an empty array when all arguments are undefined", () => { + expect(resolveVideoSources(undefined, undefined, undefined)).toEqual([]) + }) + + it("returns an empty array when there is no streaming URL, no youtubeId, and pageUrl is not a YouTube link", () => { + expect( + resolveVideoSources(null, "https://ocw.mit.edu/some-video", null), + ).toEqual([]) }) }) diff --git a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.ts b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.ts index 2ce85f6fbe..4bcd2da53f 100644 --- a/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.ts +++ b/frontends/main/src/app-pages/VideoPlaylistCollectionPage/videoSources.ts @@ -10,6 +10,7 @@ import type { VideoJsSource } from "./VideoJsPlayer" export function resolveVideoSources( streamingUrl: string | null | undefined, pageUrl: string | null | undefined, + youtubeId: string | null | undefined, ): VideoJsSource[] { if (streamingUrl) { // HLS @@ -24,6 +25,15 @@ export function resolveVideoSources( return [{ src: streamingUrl, type: "video/mp4" }] } + if (youtubeId) { + return [ + { + src: `https://www.youtube.com/watch?v=${youtubeId}`, + type: "video/youtube", + }, + ] + } + if (pageUrl && /youtube\.com|youtu\.be/.test(pageUrl)) { return [{ src: pageUrl, type: "video/youtube" }] } diff --git a/frontends/main/src/page-components/LearningResourceExpanded/InfoSection.tsx b/frontends/main/src/page-components/LearningResourceExpanded/InfoSection.tsx index 86a44262fe..680ad1afe3 100644 --- a/frontends/main/src/page-components/LearningResourceExpanded/InfoSection.tsx +++ b/frontends/main/src/page-components/LearningResourceExpanded/InfoSection.tsx @@ -529,7 +529,13 @@ const INFO_ITEMS: InfoItemConfig = [ label: "Parent Course:", Icon: RiBookLine, selector: (resource: LearningResource) => { - return formattedParentCourseName(resource) + const name = formattedParentCourseName(resource) + if (!name || !resource.url) return name + return ( + + {name} + + ) }, }, { diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py index 1fe7a9d4b1..631a2a5a26 100644 --- a/learning_resources/etl/loaders.py +++ b/learning_resources/etl/loaders.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from typing import NamedTuple +from django.conf import settings from django.contrib.auth import get_user_model from django.core.cache import caches from django.db import transaction @@ -11,6 +12,7 @@ from learning_resources.constants import ( CONTENT_TYPE_PAGE, + OCW_CONTENT_CATEGORY_LECTURE_VIDEOS, OCW_COURSE_CONTENT_CATEGORY_MAPPING, VIDEO_CONTENT_CATEGORIES, LearningResourceDelivery, @@ -1030,11 +1032,20 @@ def load_learning_materials( """ material_ids = [] + if settings.CREATE_OCW_LEARNING_MATERIALS: + promoted_ocw_file_types = set(OCW_COURSE_CONTENT_CATEGORY_MAPPING.keys()) + elif settings.SHOW_OCW_LECTURE_VIDEOS: + promoted_ocw_file_types = {OCW_CONTENT_CATEGORY_LECTURE_VIDEOS} + else: + promoted_ocw_file_types = set() + for content_file_id in content_file_ids: content_file = ContentFile.objects.get(id=content_file_id) - learning_material_tags = set( - content_file.content_tags.values_list("name", flat=True) - ) & set(OCW_COURSE_CONTENT_CATEGORY_MAPPING.keys()) + + learning_material_tags = ( + set(content_file.content_tags.values_list("name", flat=True)) + & promoted_ocw_file_types + ) if content_file.content_type != CONTENT_TYPE_PAGE and learning_material_tags: material_ids.append( load_learning_material(course_run, content_file, learning_material_tags) @@ -1330,6 +1341,7 @@ def load_video(video_data: dict) -> LearningResource: video_fields = video_data.pop("video", {}) image_data = video_data.pop("image", None) video_data["resource_category"] = LearningResourceType.video.value + video_data.pop("youtube_id", None) with transaction.atomic(): ( @@ -1443,7 +1455,7 @@ def load_documents( return document_resources -def load_ovs_playlist(playlist_data: dict) -> LearningResource: +def load_ovs_playlist(playlist_data: dict) -> LearningResource | None: """ Load a single OVS playlist (collection) and its videos into the database. @@ -1451,7 +1463,8 @@ def load_ovs_playlist(playlist_data: dict) -> LearningResource: playlist_data (dict): playlist data including nested videos list Returns: - LearningResource: the created or updated playlist resource + LearningResource | None: the created or updated playlist resource, + or None if not all videos could be matched """ channel, _ = VideoChannel.objects.get_or_create( channel_id="ovs", @@ -1519,7 +1532,9 @@ def load_ovs_playlists(playlists_data: iter) -> list[LearningResource]: return playlists -def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningResource: +def load_playlist( + video_channel: VideoChannel, playlist_data: dict +) -> LearningResource | None: """ Load a video playlist into the database @@ -1528,13 +1543,38 @@ def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningR playlist_data (dict): the video playlist Returns: - LearningResource: the created or updated playlist resource + LearningResource | None: the created or updated playlist resource, + or None if create_videos is False and not all videos could be matched """ playlist_id = playlist_data.pop("playlist_id") thumbnail_data = playlist_data.pop("image", None) videos_data = playlist_data.pop("videos", []) offered_bys_data = playlist_data.pop("offered_by", None) + create_videos = playlist_data.pop("create_videos", True) + + if create_videos: + video_resources = load_videos(videos_data) + else: + video_resources = find_existing_videos(videos_data) + + if video_resources is None: + log.info( + "Skipping playlist %s: not all videos have matching video. ", + playlist_id, + ) + + existing_resource = LearningResource.objects.filter( + readable_id=playlist_id, + resource_type=LearningResourceType.video_playlist.name, + published=True, + ).first() + if existing_resource: + existing_resource.published = False + existing_resource.save() + update_index(existing_resource, newly_created=False) + return None + playlist_data["resource_category"] = LearningResourceType.video_playlist.value with transaction.atomic(): if thumbnail_data: @@ -1557,10 +1597,13 @@ def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningR ) load_offered_by(playlist_resource, offered_bys_data) - video_resources = load_videos(videos_data) load_topics(playlist_resource, most_common_topics(video_resources)) + + # Unpublish any videos from youtube that are no longer in the playlist + # OCW content file videos are removed from the playlist but not unpublished unpublished_videos = playlist_resource.resources.filter( resource_type=LearningResourceType.video.name, + platform=PlatformType.youtube.name, published=True, ).exclude(id__in=[video.id for video in video_resources]) unpublished_video_ids = list(unpublished_videos.values_list("id", flat=True)) @@ -1584,6 +1627,42 @@ def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningR return playlist_resource +def find_existing_videos(videos_data: list[dict]) -> list[LearningResource] | None: + """ + Find existing video resources matching the given video data. + + Args: + videos_data (list of dict): list of video data dicts + + Returns: + list of LearningResource | None: list of existing video resources, + or None if any video is not found + """ + videos = [] + for video_data in videos_data: + youtube_id = video_data.get("youtube_id") + if not youtube_id: + return None + learning_resource = ( + ContentFile.objects.filter( + youtube_id=youtube_id, + direct_learning_resource__resource_type=LearningResourceType.video.name, + direct_learning_resource__published=True, + ) + .select_related("direct_learning_resource") + .first() + ) + if learning_resource: + videos.append(learning_resource.direct_learning_resource) + else: + log.info( + "No existing published video resource found for youtube_id=%s", + youtube_id, + ) + return None + return videos + + def load_playlists( video_channel: VideoChannel, playlists_data: iter ) -> list[LearningResource]: @@ -1599,7 +1678,12 @@ def load_playlists( the created or updated LearningResources for the playlists """ playlists = [ - load_playlist(video_channel, playlist_data) for playlist_data in playlists_data + playlist + for playlist in ( + load_playlist(video_channel, playlist_data) + for playlist_data in playlists_data + ) + if playlist is not None ] playlist_ids = [playlist.id for playlist in playlists] diff --git a/learning_resources/etl/loaders_test.py b/learning_resources/etl/loaders_test.py index ebbc30d4f9..c32198d4b4 100644 --- a/learning_resources/etl/loaders_test.py +++ b/learning_resources/etl/loaders_test.py @@ -34,6 +34,7 @@ from learning_resources.etl.loaders import ( ProgramLoadResult, calculate_completeness, + find_existing_videos, load_content_file, load_content_files, load_course, @@ -2094,6 +2095,7 @@ def test_load_video(mocker, video_exists, is_published, pass_topics): "offered_by": {"code": offered_by.code}, "published": is_published, "video": {"duration": video_resource.video.duration}, + "youtube_id": "youtube123", } if pass_topics: props["topics"] = expected_topics @@ -2212,6 +2214,200 @@ def test_load_playlist(mocker, playlist_exists, mock_get_similar_topics_qdrant): ) +def test_load_playlist_removed_videos_unpublished( + mocker, mock_get_similar_topics_qdrant +): + """Test that load_playlist unpublishes removed YouTube videos but not OCW videos""" + expected_topics = [{"name": "Biology"}] + LearningResourceTopicFactory.create(name="Biology") + mocker.patch( + "learning_resources.etl.loaders.most_common_topics", + return_value=expected_topics, + ) + mock_bulk_unpublish = mocker.patch( + "learning_resources.etl.loaders.bulk_resources_unpublished_actions", + ) + + channel = VideoChannelFactory.create() + ocw_platform = LearningResourcePlatformFactory.create(code=PlatformType.ocw.name) + + # Create an existing playlist with both a YouTube and OCW video + playlist = VideoPlaylistFactory.create(channel=channel).learning_resource + + youtube_video = VideoFactory.create().learning_resource + ocw_video = LearningResourceFactory.create( + resource_type=LearningResourceType.video.name, + platform=ocw_platform, + published=True, + ) + + playlist.resources.add( + youtube_video, + through_defaults={ + "relation_type": LearningResourceRelationTypes.PLAYLIST_VIDEOS, + "position": 0, + }, + ) + playlist.resources.add( + ocw_video, + through_defaults={ + "relation_type": LearningResourceRelationTypes.PLAYLIST_VIDEOS, + "position": 1, + }, + ) + + # Load playlist with new videos (excluding the old youtube and ocw videos) + new_video_resources = [ + video.learning_resource for video in VideoFactory.build_batch(2) + ] + videos_data = [ + { + **model_to_dict(video, exclude=non_transformable_attributes), + "platform": PlatformType.youtube.name, + "offered_by": {"code": LearningResourceOfferorFactory.create().code}, + } + for video in new_video_resources + ] + + props = { + **model_to_dict(playlist, exclude=["topics", *non_transformable_attributes]), + "platform": PlatformType.youtube.name, + "offered_by": {"code": LearningResourceOfferorFactory.create().code}, + "playlist_id": playlist.readable_id, + "url": f"https://youtube.com/playlist?list={playlist.readable_id}", + "image": { + "url": f"https://i.ytimg.com/vi/{playlist.readable_id}/hqdefault.jpg", + "alt": playlist.title, + }, + "videos": videos_data, + } + + load_playlist(channel, props) + + # YouTube video should be unpublished + youtube_video.refresh_from_db() + assert youtube_video.published is False + + # OCW video should NOT be unpublished + ocw_video.refresh_from_db() + assert ocw_video.published is True + + # bulk_resources_unpublished_actions called only with the youtube video + mock_bulk_unpublish.assert_called_once_with( + [youtube_video.id], + LearningResourceType.video.name, + ) + + +@pytest.mark.parametrize("all_videos_exist", [True, False]) +def test_find_existing_videos(all_videos_exist): + """Test that find_existing_videos returns matching video resources or None""" + video_resource_1 = VideoFactory.create().learning_resource + video_resource_2 = VideoFactory.create().learning_resource + run = LearningResourceRunFactory.create() + + # Create content files with youtube_id linked to video resources + ContentFileFactory.create( + run=run, + youtube_id="yt_id_1", + direct_learning_resource=video_resource_1, + ) + ContentFileFactory.create( + run=run, + youtube_id="yt_id_2", + direct_learning_resource=video_resource_2, + ) + + if all_videos_exist: + videos_data = [ + {"youtube_id": "yt_id_1"}, + {"youtube_id": "yt_id_2"}, + ] + result = find_existing_videos(videos_data) + assert result is not None + assert len(result) == 2 + assert result[0].id == video_resource_1.id + assert result[1].id == video_resource_2.id + else: + videos_data = [ + {"youtube_id": "yt_id_1"}, + {"youtube_id": "missing_yt_id"}, + ] + result = find_existing_videos(videos_data) + assert result is None + + +def test_find_existing_videos_empty_list(): + """Test that find_existing_videos returns an empty list for empty input""" + result = find_existing_videos([]) + assert result == [] + + +@pytest.mark.parametrize("all_videos_exist", [True, False]) +def test_load_playlist_create_videos_false( + mocker, all_videos_exist, mock_get_similar_topics_qdrant +): + """Test load_playlist with create_videos=False uses find_existing_videos""" + expected_topics = [{"name": "Biology"}, {"name": "Physics"}] + [ + LearningResourceTopicFactory.create(name=topic["name"]) + for topic in expected_topics + ] + mocker.patch( + "learning_resources.etl.loaders.most_common_topics", + return_value=expected_topics, + ) + mocker.patch( + "learning_resources.etl.loaders.bulk_resources_unpublished_actions", + ) + mock_update_index = mocker.patch( + "learning_resources.etl.loaders.update_index", + ) + + channel = VideoChannelFactory.create() + run = LearningResourceRunFactory.create() + + if all_videos_exist: + video_resource = VideoFactory.create().learning_resource + ContentFileFactory.create( + run=run, + youtube_id="yt_existing", + direct_learning_resource=video_resource, + ) + videos_data = [{"youtube_id": "yt_existing"}] + else: + videos_data = [{"youtube_id": "yt_missing"}] + + playlist = VideoPlaylistFactory.build().learning_resource + props = { + **model_to_dict( + playlist, + exclude=["topics", "offered_by", *non_transformable_attributes], + ), + "platform": PlatformType.youtube.name, + "playlist_id": playlist.readable_id, + "url": f"https://youtube.com/playlist?list={playlist.readable_id}", + "image": { + "url": f"https://i.ytimg.com/vi/{playlist.readable_id}/hqdefault.jpg", + "alt": playlist.title, + }, + "videos": videos_data, + "create_videos": False, + } + + result = load_playlist(channel, props) + + if all_videos_exist: + assert isinstance(result, LearningResource) + assert result.resources.count() == 1 + assert result.video_playlist.channel == channel + else: + assert result is None + # update_index should not be called for the playlist + # (only possibly for unpublishing an existing one) + mock_update_index.assert_not_called() + + def test_load_playlists_unpublish(mocker): """Test load_playlists when a video/playlist gets unpublished""" mocker.patch("learning_resources_search.tasks.bulk_deindex_learning_resources.si") @@ -2985,12 +3181,25 @@ def test_load_documents(mocker, climate_platform, mock_get_similar_topics_qdrant @pytest.mark.django_db -def test_load_learning_materials(mocker): +@pytest.mark.parametrize( + ("create_ocw_learning_materials", "show_ocw_lecture_videos"), + [ + (True, False), + (False, True), + (False, False), + ], +) +def test_load_learning_materials( + mocker, settings, create_ocw_learning_materials, show_ocw_lecture_videos +): """ Test that load_learning_materials runs load_learning_material - if a ocw content file has content_tags in OCW_COURSE_CONTENT_CATEGORY_MAPPING + based on CREATE_OCW_LEARNING_MATERIALS and SHOW_OCW_LECTURE_VIDEOS settings """ + settings.CREATE_OCW_LEARNING_MATERIALS = create_ocw_learning_materials + settings.SHOW_OCW_LECTURE_VIDEOS = show_ocw_lecture_videos + ocw = LearningResourcePlatformFactory.create(code=PlatformType.ocw.name) ocw_course = CourseFactory.create( platform=ocw.code, @@ -3000,6 +3209,7 @@ def test_load_learning_materials(mocker): relevant_content_tag = LearningResourceContentTagFactory.create( name="Programming Assignments" ) + lecture_video_tag = LearningResourceContentTagFactory.create(name="Lecture Videos") irrelevant_content_tag = LearningResourceContentTagFactory.create(name="Syllabus") no_longer_relevant_resource = LearningResourceFactory.create() @@ -3010,6 +3220,12 @@ def test_load_learning_materials(mocker): content_type=CONTENT_TYPE_FILE, ) + lecture_video_content_file = ContentFileFactory.create( + run=ocw_course.learning_resource.runs.first(), + content_tags=[lecture_video_tag], + content_type=CONTENT_TYPE_FILE, + ) + other_content_file = ContentFileFactory.create( run=ocw_course.learning_resource.runs.first(), content_tags=[irrelevant_content_tag], @@ -3025,32 +3241,53 @@ def test_load_learning_materials(mocker): course_run=ocw_course.learning_resource.runs.first(), content_file_ids=[ learning_material_content_file.id, + lecture_video_content_file.id, other_content_file.id, ], ) - assert load_learning_materials_spy.call_count == 1 - load_learning_materials_spy.assert_called_with( - ocw_course.learning_resource.runs.first(), - learning_material_content_file, - {"Programming Assignments"}, - ) - - learning_material_content_file.refresh_from_db() - - learning_material = learning_material_content_file.direct_learning_resource - - resource_relationships = ocw_course.learning_resource.children.all() - - assert resource_relationships.count() == 1 - - assert resource_relationships.first().child.id == learning_material.id - assert ( - resource_relationships.first().relation_type - == LearningResourceRelationTypes.COURSE_LEARNING_MATERIALS.value - ) - - assert mock_index.call_count == 2 + if create_ocw_learning_materials: + # Both programming assignments and lecture videos are promoted + assert load_learning_materials_spy.call_count == 2 + load_learning_materials_spy.assert_any_call( + ocw_course.learning_resource.runs.first(), + learning_material_content_file, + {"Programming Assignments"}, + ) + load_learning_materials_spy.assert_any_call( + ocw_course.learning_resource.runs.first(), + lecture_video_content_file, + {"Lecture Videos"}, + ) + resource_relationships = ocw_course.learning_resource.children.all() + assert resource_relationships.count() == 2 + # 2 load_learning_material calls (each calls update_index) + # + 1 for unpublishing no_longer_relevant_resource + assert mock_index.call_count == 3 + elif show_ocw_lecture_videos: + # Only lecture videos are promoted + assert load_learning_materials_spy.call_count == 1 + load_learning_materials_spy.assert_called_with( + ocw_course.learning_resource.runs.first(), + lecture_video_content_file, + {"Lecture Videos"}, + ) + resource_relationships = ocw_course.learning_resource.children.all() + assert resource_relationships.count() == 1 + lecture_video_content_file.refresh_from_db() + assert ( + resource_relationships.first().child.id + == lecture_video_content_file.direct_learning_resource.id + ) + # 1 load_learning_material call + 1 for unpublishing no_longer_relevant_resource + assert mock_index.call_count == 2 + else: + # Nothing is promoted + assert load_learning_materials_spy.call_count == 0 + resource_relationships = ocw_course.learning_resource.children.all() + assert resource_relationships.count() == 0 + # 1 for unpublishing no_longer_relevant_resource + assert mock_index.call_count == 1 no_longer_relevant_resource.refresh_from_db() assert no_longer_relevant_resource.published is False diff --git a/learning_resources/etl/pipelines.py b/learning_resources/etl/pipelines.py index 78cadb436a..5028ca72e7 100644 --- a/learning_resources/etl/pipelines.py +++ b/learning_resources/etl/pipelines.py @@ -160,7 +160,10 @@ def ocw_courses_etl( calc_completeness=True, ) - if settings.CREATE_OCW_LEARNING_MATERIALS: + if ( + settings.CREATE_OCW_LEARNING_MATERIALS + or settings.SHOW_OCW_LECTURE_VIDEOS + ): loaders.load_learning_materials(course_run, content_file_ids) else: log.info("No course data found for %s", url_path) diff --git a/learning_resources/etl/youtube.py b/learning_resources/etl/youtube.py index 755655fbe4..d98b49d8ea 100644 --- a/learning_resources/etl/youtube.py +++ b/learning_resources/etl/youtube.py @@ -192,7 +192,11 @@ def extract_playlist_items( def _extract_playlists( - youtube_client: Resource, request: BatchHttpRequest, playlist_configs: dict + youtube_client: Resource, + request: BatchHttpRequest, + playlist_configs: dict, + *, + create_videos_channel_setting: bool, ) -> Generator[tuple, None, None]: """ Extract a list of playlists @@ -201,6 +205,8 @@ def _extract_playlists( youtube_client (Resource): Youtube api client request (BatchHttpRequest): Youtube api BatchHttpRequest object playlist_configs (dict): dict of playlist configurations + create_videos_channel_setting (bool): whether the channel config + is to create videos from youtube data Returns: A generator that yields playlist data @@ -216,15 +222,20 @@ def _extract_playlists( playlist_id = playlist_data["id"] if playlist_id in playlist_configs: playlist_config = playlist_configs[playlist_id] + create_videos = playlist_config.get( + "create_videos", create_videos_channel_setting + ) else: playlist_config = playlist_configs.get( WILDCARD_PLAYLIST_ID, {"ignore": True} ) + create_videos = create_videos_channel_setting if not playlist_config.get("ignore", False): yield ( playlist_data, extract_playlist_items(youtube_client, playlist_id), + create_videos, ) request = youtube_client.playlists().list_next(request, response) @@ -237,7 +248,11 @@ def _extract_playlists( def extract_playlists( - youtube_client: Resource, playlist_configs: list[dict], channel_id: str + youtube_client: Resource, + playlist_configs: list[dict], + channel_id: str, + *, + create_videos_channel_setting: bool, ) -> Generator[tuple, None, None]: """ Extract a list of playlists for a channel @@ -245,6 +260,8 @@ def extract_playlists( youtube_client (object): Youtube api client playlist_configs (list of dict): list of playlist configurations channel_id (str): youtube's id for the channel + create_videos_channel_setting (bool): whether the channel config + is to create videos from youtube data Returns: A generator that yields playlist data """ @@ -272,7 +289,12 @@ def extract_playlists( ) for request in requests: - yield from _extract_playlists(youtube_client, request, playlist_configs_by_id) + yield from _extract_playlists( + youtube_client, + request, + playlist_configs_by_id, + create_videos_channel_setting=create_videos_channel_setting, + ) def extract_channels( @@ -311,11 +333,13 @@ def extract_channels( channel_id = channel_data["id"] channel_config = channel_configs_by_ids[channel_id] offered_by = channel_config.get("offered_by", None) + create_videos = channel_config.get("create_videos", True) playlist_configs = channel_config.get("playlists", []) - - # if we hit any error on a playlist, we simply abort playlists = extract_playlists( - youtube_client, playlist_configs, channel_id + youtube_client, + playlist_configs, + channel_id, + create_videos_channel_setting=create_videos, ) yield (offered_by, channel_data, playlists) @@ -481,11 +505,16 @@ def transform_video(video_data: dict, offered_by_code: str) -> dict: "duration": video_data["contentDetails"]["duration"], }, "availability": Availability.anytime.name, + "youtube_id": video_data["id"], } def transform_playlist( - playlist_data: dict, videos: Generator[dict, None, None], offered_by_code: str + playlist_data: dict, + videos: Generator[dict, None, None], + offered_by_code: str, + *, + create_videos: bool, ) -> dict: """ Transform a playlist into our normalized data @@ -494,6 +523,8 @@ def transform_playlist( playlist_data (dict): the extracted playlist data videos (generator): generator for data for the playlist's videos offered_by_code (str): the offered_by code for this playlist + create_videos (bool): whether to create videos from this playlist + or match to existing videos without creating new ones Returns: dict: normalized playlist data """ @@ -515,6 +546,7 @@ def transform_playlist( "alt": playlist_data["snippet"]["title"], }, "availability": Availability.anytime.name, + "create_videos": create_videos, } @@ -539,8 +571,10 @@ def transform(extracted_channels: iter) -> Generator[dict, None, None]: "published": True, # intentional generator expression "playlists": ( - transform_playlist(playlist, videos, offered_by) - for playlist, videos in playlists + transform_playlist( + playlist, videos, offered_by, create_videos=create_videos + ) + for playlist, videos, create_videos in playlists ), } diff --git a/learning_resources/etl/youtube_test.py b/learning_resources/etl/youtube_test.py index ad3300351e..44d30f53ad 100644 --- a/learning_resources/etl/youtube_test.py +++ b/learning_resources/etl/youtube_test.py @@ -176,18 +176,18 @@ def extracted_and_transformed_values(youtube_api_responses): ( OfferedBy.ocw.name, channels_list[0]["items"][0], - [(playlists_list[0]["items"][0], ocw_videos)], + [(playlists_list[0]["items"][0], ocw_videos, True)], ), ( OfferedBy.mitx.name, channels_list[0]["items"][1], - [(playlists_list[1]["items"][0], mitx_videos)], + [(playlists_list[1]["items"][0], mitx_videos, True)], ), ( "csail", channels_list[0]["items"][2], [ - (playlists_list[2]["items"][1], csail_videos), + (playlists_list[2]["items"][1], csail_videos, True), ], ), ] @@ -236,11 +236,13 @@ def extracted_and_transformed_values(youtube_api_responses): "video": { "duration": video["contentDetails"]["duration"], }, + "youtube_id": video["id"], } for video in videos ], + "create_videos": create_videos, } - for playlist, videos in playlists + for playlist, videos, create_videos in playlists ], } for offered_by, channel, playlists in extracted @@ -263,8 +265,8 @@ def _resolve_extracted_channels(channels): def _resolve_extracted_playlist(playlist): """Resolve a playlist and its nested generators""" - playlist_data, videos = playlist - return (playlist_data, list(videos)) + playlist_data, videos, create_videos = playlist + return (playlist_data, list(videos), create_videos) @pytest.fixture @@ -423,10 +425,54 @@ def test_extract_playlists_errors( request = Mock(execute=Mock(side_effect=error(Mock(), b""))) if raised_exception: with pytest.raises(raised_exception) as err: - list(youtube._extract_playlists(mock_youtube_client, request, {})) # noqa: SLF001 + list( + youtube._extract_playlists( # noqa: SLF001 + mock_youtube_client, request, {}, create_videos_channel_setting=True + ) + ) assert message in str(err) +@pytest.mark.parametrize( + ("channel_create_videos", "playlist_create_videos", "expected"), + [ + (True, None, True), + (False, None, False), + (True, False, False), + (False, True, True), + ], +) +def test_extract_playlists_create_videos( + mock_youtube_client, + channel_create_videos, + playlist_create_videos, + expected, +): + """Test that create_videos is resolved from playlist config or channel setting""" + playlist_id = "PL_test_123" + playlist_data = {"id": playlist_id, "snippet": {"title": "Test"}} + request = Mock(execute=Mock(return_value={"items": [playlist_data]})) + mock_youtube_client.return_value.playlists.return_value.list_next.return_value = ( + None + ) + + playlist_config = {"id": playlist_id} + if playlist_create_videos is not None: + playlist_config["create_videos"] = playlist_create_videos + + results = list( + youtube._extract_playlists( # noqa: SLF001 + mock_youtube_client.return_value, + request, + {playlist_id: playlist_config}, + create_videos_channel_setting=channel_create_videos, + ) + ) + assert len(results) == 1 + _, _, create_videos = results[0] + assert create_videos is expected + + @pytest.mark.django_db @pytest.mark.parametrize( ("error", "raised_exception", "message"), @@ -462,6 +508,7 @@ def test_transform_playlist( extracted[0][2][0][0], extracted[0][2][0][1], OfferedBy.ocw.name, + create_videos=True, ) assert {**result, "videos": list(result["videos"])} == { **transformed[0]["playlists"][0], diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index f11361390e..3063ad5bf4 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -402,10 +402,14 @@ def update_ocw_learning_material_resources(self): # noqa: ARG001 ocw courses or content files """ - if not settings.CREATE_OCW_LEARNING_MATERIALS: + if ( + not settings.CREATE_OCW_LEARNING_MATERIALS + and not settings.SHOW_OCW_LECTURE_VIDEOS + ): message = ( - "CREATE_OCW_LEARNING_MATERIALS flag is set to False." - " update_ocw_learning_material_resources cannont run" + "update_ocw_learning_material_resources cannot run because " + "CREATE_OCW_LEARNING_MATERIALS and SHOW_OCW_LECTURE_VIDEOS " + "flags are set to False." ) raise RuntimeError(message) diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index d5e10ce417..100099f532 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -480,23 +480,26 @@ def generate_filter_clauses(search_params): if search_params.get("endpoint") == LEARNING_RESOURCE and not search_params.get( "show_ocw_files" ): + ocw_should = [ + { + "bool": { + "must_not": [ + { + "nested": { + "path": "platform", + "query": {"term": {"platform.code": "ocw"}}, + } + } + ] + } + }, + {"term": {"resource_type": "course"}}, + ] + if settings.SHOW_OCW_LECTURE_VIDEOS: + ocw_should.append({"term": {"resource_category": "Lecture Videos"}}) ocw_clause = { "bool": { - "should": [ - { - "bool": { - "must_not": [ - { - "nested": { - "path": "platform", - "query": {"term": {"platform.code": "ocw"}}, - } - } - ] - } - }, - {"term": {"resource_type": "course"}}, - ], + "should": ocw_should, "minimum_should_match": 1, } } diff --git a/main/settings.py b/main/settings.py index 2d36ae984d..8359b06ede 100644 --- a/main/settings.py +++ b/main/settings.py @@ -934,3 +934,13 @@ def get_all_config_keys(): # Hubspot settings MITOL_HUBSPOT_API_PRIVATE_TOKEN = get_string("MITOL_HUBSPOT_API_PRIVATE_TOKEN", None) + +# Create all learning material resources for OCW courses +# Learning material resources are behind show_ocw_files flag in search +CREATE_OCW_LEARNING_MATERIALS = get_bool("CREATE_OCW_LEARNING_MATERIALS", default=False) +# Create just lecture video resources for OCW courses. +# OCW lecture video resources will be displayed in search +# regardless of show_ocw_files flag +# If the video config is set to `create_videos`=False ocw videos will be added to +# playlists +SHOW_OCW_LECTURE_VIDEOS = get_bool("SHOW_OCW_LECTURE_VIDEOS", default=False) diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py index 0e19ce3f93..96ffcfa36a 100644 --- a/main/settings_course_etl.py +++ b/main/settings_course_etl.py @@ -145,4 +145,3 @@ "CONTENT_BASE_URL_OLL", "https://openlearninglibrary.mit.edu" ) CONTENT_BASE_URL_EDX = get_string("CONTENT_BASE_URL_EDX", "https://courses.edx.org") -CREATE_OCW_LEARNING_MATERIALS = get_bool("CREATE_OCW_LEARNING_MATERIALS", default=False) From 3332a4994937f9fce5846bc50d753bfb8f4f65ab Mon Sep 17 00:00:00 2001 From: Anastasia Beglova Date: Thu, 30 Apr 2026 09:12:54 -0400 Subject: [PATCH 3/4] show resource_category in channel pages (#3264) --- .../ChannelPage/ChannelSearch.test.tsx | 59 +++++++++++++++++++ .../app-pages/ChannelPage/searchRequests.ts | 3 + 2 files changed, 62 insertions(+) diff --git a/frontends/main/src/app-pages/ChannelPage/ChannelSearch.test.tsx b/frontends/main/src/app-pages/ChannelPage/ChannelSearch.test.tsx index 8f6b4317fd..e4e473cae1 100644 --- a/frontends/main/src/app-pages/ChannelPage/ChannelSearch.test.tsx +++ b/frontends/main/src/app-pages/ChannelPage/ChannelSearch.test.tsx @@ -310,4 +310,63 @@ describe("ChannelSearch", () => { await user.click(screen.getByRole("button", { name: "Search" })) expect(location.current.searchParams.get("q")).toBe("woof") }) + + test.each([ + { channelType: ChannelTypeEnum.Topic }, + { channelType: ChannelTypeEnum.Department }, + { channelType: ChannelTypeEnum.Unit }, + ])( + "Shows Resource Category facet only when resource_type_group=learning_material ($channelType)", + async ({ channelType }) => { + const { channel } = setMockApiResponses({ + channelPatch: { channel_type: channelType }, + search: { + count: 700, + metadata: { + aggregations: { + resource_type_group: [ + { key: "course", doc_count: 100 }, + { key: "learning_material", doc_count: 200 }, + ], + resource_category: [ + { key: "Course", doc_count: 100 }, + { key: "Video", doc_count: 100 }, + ], + topic: [{ key: "physics", doc_count: 100 }], + department: [{ key: "1", doc_count: 100 }], + certification_type: [{ key: "micromasters", doc_count: 100 }], + delivery: [{ key: "online", doc_count: 100 }], + offered_by: [{ key: "ocw", doc_count: 100 }], + }, + suggestions: [], + }, + }, + }) + + const { + view: { unmount }, + } = renderWithProviders(, { + url: `/c/${channel.channel_type}/${channel.name}/`, + }) + + const facetsContainer = await screen.findByTestId("facets-container") + await within(facetsContainer).findByText("Certificate") + expect( + within(facetsContainer).queryByText("Resource Category"), + ).toBeNull() + + unmount() + + // Re-render with resource_type_group=learning_material + renderWithProviders(, { + url: `/c/${channel.channel_type}/${channel.name}/?resource_type_group=learning_material`, + }) + + expect( + await within(await screen.findByTestId("facets-container")).findByText( + "Resource Category", + ), + ).toBeInTheDocument() + }, + ) }) diff --git a/frontends/main/src/app-pages/ChannelPage/searchRequests.ts b/frontends/main/src/app-pages/ChannelPage/searchRequests.ts index 7deb66d0d4..66b4b45d59 100644 --- a/frontends/main/src/app-pages/ChannelPage/searchRequests.ts +++ b/frontends/main/src/app-pages/ChannelPage/searchRequests.ts @@ -30,6 +30,7 @@ export const getConstantSearchParams = (searchFilter?: string) => { const FACETS_BY_CHANNEL_TYPE: Record = { [ChannelTypeEnum.Topic]: [ "free", + "resource_category", "resource_type", "certification_type", "delivery", @@ -38,6 +39,7 @@ const FACETS_BY_CHANNEL_TYPE: Record = { ], [ChannelTypeEnum.Department]: [ "free", + "resource_category", "resource_type", "certification_type", "topic", @@ -46,6 +48,7 @@ const FACETS_BY_CHANNEL_TYPE: Record = { ], [ChannelTypeEnum.Unit]: [ "free", + "resource_category", "resource_type", "topic", "certification_type", From f126e3f63b9a593d81bc6d85785890883dc5da4c Mon Sep 17 00:00:00 2001 From: Doof Date: Thu, 30 Apr 2026 13:29:45 +0000 Subject: [PATCH 4/4] Release 0.65.3 --- RELEASE.rst | 7 +++++++ main/settings.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index e28606c012..429c991afc 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -1,6 +1,13 @@ Release Notes ============= +Version 0.65.3 +-------------- + +- show resource_category in channel pages (#3264) +- use youtube etl for playlists and ocw for videos (#3257) +- Vector search pagination fix for empty hybrid searches (#3251) + Version 0.65.2 (Released April 30, 2026) -------------- diff --git a/main/settings.py b/main/settings.py index ba8db9989d..669435030c 100644 --- a/main/settings.py +++ b/main/settings.py @@ -35,7 +35,7 @@ from main.settings_pluggy import * # noqa: F403 from openapi.settings_spectacular import open_spectacular_settings -VERSION = "0.65.2" +VERSION = "0.65.3" log = logging.getLogger()