diff --git a/cppa_youtube_script_tracker/tests/test_failure_classification.py b/cppa_youtube_script_tracker/tests/test_failure_classification.py new file mode 100644 index 00000000..981430b2 --- /dev/null +++ b/cppa_youtube_script_tracker/tests/test_failure_classification.py @@ -0,0 +1,44 @@ +"""How core.errors.classify_failure treats exceptions from the YouTube stack. + +googleapiclient.errors.HttpError is not specially classified today; it falls +through to UNKNOWN unless it subclasses a handled type. +""" + +import pytest + +from core.errors import CollectorFailureCategory, classify_failure +from cppa_youtube_script_tracker.fetcher import QuotaExceededError + + +class _FakeResp: + def __init__(self, status: int, reason: str = "OK"): + self.status = status + self.reason = reason + + +def test_classify_failure_value_error_is_validation(): + assert classify_failure(ValueError("YOUTUBE_API_KEY is not set")) is ( + CollectorFailureCategory.VALIDATION + ) + + +def test_classify_failure_import_error_is_unknown(): + assert ( + classify_failure(ImportError("no module")) is CollectorFailureCategory.UNKNOWN + ) + + +def test_classify_failure_quota_exceeded_error_is_unknown(): + assert ( + classify_failure(QuotaExceededError("quota")) + is CollectorFailureCategory.UNKNOWN + ) + + +def test_classify_failure_google_http_error_is_unknown_if_client_installed(): + try: + from googleapiclient.errors import HttpError + except ImportError: + pytest.skip("google-api-python-client not installed") + err = HttpError(resp=_FakeResp(403, "Forbidden"), content=b"{}") + assert classify_failure(err) is CollectorFailureCategory.UNKNOWN diff --git a/cppa_youtube_script_tracker/tests/test_fetcher.py b/cppa_youtube_script_tracker/tests/test_fetcher.py index 84d55db8..7f7d2829 100644 --- a/cppa_youtube_script_tracker/tests/test_fetcher.py +++ b/cppa_youtube_script_tracker/tests/test_fetcher.py @@ -266,3 +266,168 @@ def fake_import(name, globals_=None, locals_=None, fromlist=(), level=0): datetime(2024, 1, 1, tzinfo=timezone.utc), datetime(2024, 2, 1, tzinfo=timezone.utc), ) + + +def test_build_queries_dedupe_duplicate_channel_terms(): + """Duplicate terms in _CHANNEL_FOCUSED_TERMS collapse via _dedupe_pairs (continue branch).""" + with patch.object(fetcher_mod, "_CHANNEL_FOCUSED_TERMS", ["C++", "C++"]): + pairs = fetcher_mod._build_queries("CppCon") + assert len(pairs) == 1 + assert pairs[0][0] == "C++" + + +def test_fetch_search_page_passes_page_token_and_channel_id(): + youtube = MagicMock() + youtube.search.return_value.list.return_value.execute = MagicMock( + return_value={"items": [], "nextPageToken": None} + ) + fetcher_mod._fetch_search_page( + youtube, "q", "chan123", "after", "before", "nextTok" + ) + kwargs = youtube.search.return_value.list.call_args.kwargs + assert kwargs["pageToken"] == "nextTok" + assert kwargs["channelId"] == "chan123" + + +def test_format_video_data_missing_snippet_and_statistics(): + vd = {"id": "thin1", "snippet": {}, "statistics": {}, "contentDetails": {}} + out = fetcher_mod._format_video_data(vd, search_term="s") + assert out["video_id"] == "thin1" + assert out["title"] == "" + assert out["duration_seconds"] == 0 + assert out["view_count"] is None + + +@override_settings(YOUTUBE_API_KEY="k") +def test_process_one_channel_query_pagination_two_pages(): + youtube = MagicMock() + page = {"n": 0} + + def search_execute(): + page["n"] += 1 + if page["n"] == 1: + return { + "items": [ + {"id": {"kind": "youtube#video", "videoId": "pg1"}}, + ], + "nextPageToken": "t2", + } + return { + "items": [ + {"id": {"kind": "youtube#video", "videoId": "pg2"}}, + ], + "nextPageToken": None, + } + + search_mock = MagicMock(side_effect=search_execute) + youtube.search.return_value.list.return_value.execute = search_mock + + def video_execute(): + call = youtube.videos.return_value.list.call_args + ids = call.kwargs["id"].split(",") + items = [] + for vid in ids: + items.append( + { + "id": vid, + "snippet": {"title": vid}, + "statistics": {}, + "contentDetails": {"duration": "PT30S"}, + } + ) + return {"items": items} + + youtube.videos.return_value.list.return_value.execute = video_execute + + seen: set[str] = set() + with patch.object(fetcher_mod.time, "sleep"): + out = fetcher_mod._process_one_channel_query( + youtube, + "q", + None, + "a", + "b", + seen, + min_duration_seconds=0, + ) + assert len(out) == 2 + assert {row["video_id"] for row in out} == {"pg1", "pg2"} + assert search_mock.call_count == 2 + + +def test_process_one_channel_query_skips_detail_row_with_empty_id(): + youtube = MagicMock() + youtube.search.return_value.list.return_value.execute = MagicMock( + return_value={ + "items": [{"id": {"kind": "youtube#video", "videoId": "ghost"}}], + "nextPageToken": None, + } + ) + youtube.videos.return_value.list.return_value.execute = MagicMock( + return_value={ + "items": [ + { + "id": "", + "snippet": {}, + "statistics": {}, + "contentDetails": {"duration": "PT1M"}, + } + ] + } + ) + seen: set[str] = set() + with patch.object(fetcher_mod.time, "sleep"): + out = fetcher_mod._process_one_channel_query( + youtube, + "q", + None, + "a", + "b", + seen, + min_duration_seconds=0, + ) + assert out == [] + + +def test_process_one_channel_query_non_video_search_items_no_video_list_call(): + youtube = MagicMock() + youtube.search.return_value.list.return_value.execute = MagicMock( + return_value={ + "items": [ + {"id": {"kind": "youtube#playlist", "playlistId": "PL1"}}, + ], + "nextPageToken": None, + } + ) + vid_exec = MagicMock(return_value={"items": []}) + youtube.videos.return_value.list.return_value.execute = vid_exec + seen: set[str] = set() + with patch.object(fetcher_mod.time, "sleep"): + out = fetcher_mod._process_one_channel_query( + youtube, + "q", + None, + "a", + "b", + seen, + min_duration_seconds=0, + ) + assert out == [] + vid_exec.assert_not_called() + + +def test_process_one_channel_query_breaks_when_search_returns_none(): + youtube = MagicMock() + youtube.search.return_value.list.return_value.execute = MagicMock(return_value=None) + seen: set[str] = set() + with patch.object(fetcher_mod.time, "sleep"): + out = fetcher_mod._process_one_channel_query( + youtube, + "q", + None, + "a", + "b", + seen, + min_duration_seconds=0, + ) + assert out == [] diff --git a/cppa_youtube_script_tracker/tests/test_run_command.py b/cppa_youtube_script_tracker/tests/test_run_command.py index dfb186ec..7d487516 100644 --- a/cppa_youtube_script_tracker/tests/test_run_command.py +++ b/cppa_youtube_script_tracker/tests/test_run_command.py @@ -16,6 +16,7 @@ Command, _enrich_speakers_from_transcript, _move_to_raw, + _persist_fetched_video, _persist_video, _process_queue, _read_text_file, @@ -288,6 +289,22 @@ def test_collector_sync_pinecone_invokes_run(monkeypatch): spy.assert_called_once_with(app_id="myapp", namespace="mynamespace") +@pytest.mark.django_db +def test_collector_sync_pinecone_skips_on_dry_run(monkeypatch): + spy = MagicMock() + monkeypatch.setattr(f"{_CMD}._run_pinecone_sync", spy) + collector = CppaYoutubeScriptTrackerCollector( + cmd=Command(stdout=StringIO(), stderr=StringIO()), + options={ + "dry_run": True, + "pinecone_app_id": "x", + "pinecone_namespace": "y", + }, + ) + collector.sync_pinecone() + spy.assert_not_called() + + @pytest.mark.django_db def test_command_handle_invokes_collector_phases(): collector = MagicMock() @@ -327,3 +344,380 @@ def test_command_phase_outputs(monkeypatch): ) collector.run() assert "Phase 2" in stdout.getvalue() + + +@pytest.mark.django_db +def test_run_phase_3_download_vtt_returns_none(monkeypatch, tmp_path): + YouTubeVideo.objects.create(video_id="nofile", title="t", has_transcript=False) + monkeypatch.setattr(f"{_CMD}.get_raw_transcripts_dir", lambda: tmp_path) + monkeypatch.setattr(f"{_CMD}.download_vtt", lambda *_a, **_k: None) + + ok, fail = _run_phase_3() + assert ok == 0 and fail == 1 + assert YouTubeVideo.objects.get(video_id="nofile").has_transcript is False + + +@pytest.mark.django_db +def test_run_phase_3_download_vtt_raises(monkeypatch, tmp_path, caplog): + import logging + + YouTubeVideo.objects.create(video_id="boom", title="t", has_transcript=False) + monkeypatch.setattr(f"{_CMD}.get_raw_transcripts_dir", lambda: tmp_path) + + def _boom(*_a, **_k): + raise RuntimeError("yt-dlp simulated failure") + + monkeypatch.setattr(f"{_CMD}.download_vtt", _boom) + caplog.set_level(logging.ERROR) + + ok, fail = _run_phase_3() + assert ok == 0 and fail == 1 + assert any("transcript download failed" in r.getMessage() for r in caplog.records) + + +@pytest.mark.django_db +def test_persist_fetched_video_exception_leaves_queue_json(tmp_path, monkeypatch): + meta_dir = tmp_path / "queue" + meta_dir.mkdir(parents=True) + + monkeypatch.setattr( + f"{_CMD}.get_metadata_queue_path", lambda vid: meta_dir / f"{vid}.json" + ) + monkeypatch.setattr( + f"{_CMD}._persist_video", + lambda _data: (_ for _ in ()).throw(RuntimeError("db")), + ) + + created, skipped = _persist_fetched_video( + { + "video_id": "qvid", + "channel_id": "c1", + "channel_title": "CppCon", + "title": "T", + "description": "", + "tags": [], + } + ) + assert created is False and skipped is True + qpath = meta_dir / "qvid.json" + assert qpath.exists() + payload = json.loads(qpath.read_text(encoding="utf-8")) + assert payload["video_id"] == "qvid" + + +@pytest.mark.django_db +def test_process_queue_persist_exception_leaves_queue_file(tmp_path, monkeypatch): + meta = tmp_path / "metadata" + meta.mkdir(parents=True) + qfile = meta / "baditem.json" + qfile.write_text( + json.dumps( + { + "video_id": "bad1", + "channel_id": "c1", + "channel_title": "CppCon", + "title": "Hi", + "description": "", + "tags": [], + } + ), + encoding="utf-8", + ) + raw_meta = tmp_path / "raw_meta" + raw_meta.mkdir(parents=True) + + monkeypatch.setattr(f"{_CMD}.iter_metadata_queue_jsons", lambda: [qfile]) + + def _persist_raises(_item): + raise RuntimeError("persist boom") + + monkeypatch.setattr(f"{_CMD}._persist_video", _persist_raises) + monkeypatch.setattr( + f"{_CMD}.get_raw_metadata_path", lambda vid: raw_meta / f"{vid}.json" + ) + + processed, skipped = _process_queue() + assert processed == 1 + assert skipped >= 1 + assert qfile.exists() + + +@pytest.mark.django_db +def test_enrich_speakers_from_transcript_empty_file(tmp_path): + ch = YouTubeChannel.objects.create(channel_id="c66", channel_title="Chan") + video = YouTubeVideo.objects.create( + video_id="enr_empty", + channel=ch, + title="T", + description="", + ) + tr = tmp_path / "empty.vtt" + tr.write_text("", encoding="utf-8") + from cppa_youtube_script_tracker.models import YouTubeVideoSpeaker + + before = YouTubeVideoSpeaker.objects.filter(video=video).count() + _enrich_speakers_from_transcript(video, str(tr)) + assert YouTubeVideoSpeaker.objects.filter(video=video).count() == before + + +@pytest.mark.django_db +def test_enrich_speakers_from_transcript_resolve_empty(monkeypatch, tmp_path): + ch = YouTubeChannel.objects.create(channel_id="c77", channel_title="Chan") + video = YouTubeVideo.objects.create( + video_id="enr_none", + channel=ch, + title="T", + description="", + ) + tr = tmp_path / "some.vtt" + tr.write_text("WEBVTT\n\nnote\n", encoding="utf-8") + monkeypatch.setattr(f"{_CMD}.resolve_speakers", lambda **_k: []) + + _enrich_speakers_from_transcript(video, str(tr)) + + +@pytest.mark.django_db +def test_enrich_speakers_from_transcript_link_failure_logged( + monkeypatch, tmp_path, caplog +): + import logging + + ch = YouTubeChannel.objects.create(channel_id="c88", channel_title="Chan") + video = YouTubeVideo.objects.create( + video_id="enr_warn", + channel=ch, + title="T", + description="", + ) + tr = tmp_path / "w.vtt" + tr.write_text("WEBVTT\n\n00:00:00.000 --> 00:00:01.000\nX\n", encoding="utf-8") + monkeypatch.setattr(f"{_CMD}.resolve_speakers", lambda **_k: ["Pat"]) + calls = {"n": 0} + + def _bad_speaker(*_a, **_k): + calls["n"] += 1 + if calls["n"] == 1: + raise RuntimeError("speaker create failed") + from cppa_user_tracker.services import get_or_create_youtube_speaker as real + + return real(*_a, **_k) + + monkeypatch.setattr(f"{_CMD}.get_or_create_youtube_speaker", _bad_speaker) + caplog.set_level(logging.WARNING) + + _enrich_speakers_from_transcript(video, str(tr)) + assert any("could not link speaker" in r.getMessage() for r in caplog.records) + + +@pytest.mark.django_db +def test_command_phase_3_skip_transcript_stdout(): + stdout = StringIO() + cmd = Command(stdout=stdout, stderr=StringIO()) + cmd._phase_3(skip_transcript=True) + assert "skipped" in stdout.getvalue().lower() + + +@pytest.mark.django_db +def test_command_phase_3_stdout_ok_fail(monkeypatch): + stdout = StringIO() + cmd = Command(stdout=stdout, stderr=StringIO()) + monkeypatch.setattr(f"{_CMD}._run_phase_3", lambda: (2, 3)) + cmd._phase_3(skip_transcript=False) + out = stdout.getvalue() + assert "2" in out and "3" in out and "downloaded" in out.lower() + + +def test_run_pinecone_sync_skips_empty_namespace(caplog): + import logging + + caplog.set_level(logging.WARNING) + _run_pinecone_sync("myapp", "") + assert any("namespace" in r.message.lower() for r in caplog.records) + + +def test_run_pinecone_sync_call_command_exception_logged(caplog): + import logging + + def _raise(*_a, **_k): + raise RuntimeError("pinecone down") + + caplog.set_level(logging.WARNING) + with patch(f"{_CMD}.call_command", side_effect=_raise): + _run_pinecone_sync("app", "ns") + assert any("pinecone" in r.message.lower() for r in caplog.records) + + +@pytest.mark.django_db +def test_collector_propagates_unhandled_phase_error(monkeypatch): + stdout = StringIO() + cmd = Command(stdout=stdout, stderr=StringIO()) + + monkeypatch.setattr(cmd, "_phase_1", lambda _dry: None) + + def _boom(*_a, **_k): + raise RuntimeError("phase2 failed") + + monkeypatch.setattr(cmd, "_phase_2", _boom) + + collector = CppaYoutubeScriptTrackerCollector( + cmd=cmd, + options={ + "start_time": "2024-01-01T00:00:00Z", + "end_time": "2024-02-01T00:00:00Z", + "channel_title": "", + "dry_run": False, + "skip_transcript": True, + }, + ) + with pytest.raises(RuntimeError, match="phase2 failed"): + collector.run() + + +@pytest.mark.django_db +def test_persist_video_skips_blank_tags(monkeypatch): + monkeypatch.setattr(f"{_CMD}.resolve_speakers", lambda **_: ["S"]) + _created, skipped = _persist_video( + { + "video_id": "tagblank", + "channel_id": "c1", + "channel_title": "CppCon", + "title": "T", + "description": "", + "tags": ["", " ", "real"], + } + ) + assert skipped is False + from cppa_youtube_script_tracker.models import YouTubeVideoTags + + links = YouTubeVideoTags.objects.filter(youtube_video_id="tagblank") + assert links.count() == 1 + + +@pytest.mark.django_db +def test_process_queue_json_read_failure_logged(tmp_path, monkeypatch, caplog): + import logging + + bad = tmp_path / "bad.json" + bad.write_text("{not json", encoding="utf-8") + monkeypatch.setattr(f"{_CMD}.iter_metadata_queue_jsons", lambda: [bad]) + caplog.set_level(logging.ERROR) + processed, _ = _process_queue() + assert processed == 0 + assert any("failed to read" in r.getMessage() for r in caplog.records) + + +@pytest.mark.django_db +def test_process_queue_skipped_video_increments_counter(tmp_path, monkeypatch): + meta = tmp_path / "metadata" + meta.mkdir(parents=True) + qfile = meta / "skipcount.json" + qfile.write_text( + json.dumps( + { + "video_id": "", + "channel_id": "c1", + "channel_title": "CppCon", + "title": "Hi", + "description": "", + "tags": [], + } + ), + encoding="utf-8", + ) + raw_meta = tmp_path / "raw_meta" + raw_meta.mkdir(parents=True) + monkeypatch.setattr(f"{_CMD}.iter_metadata_queue_jsons", lambda: [qfile]) + monkeypatch.setattr( + f"{_CMD}.get_raw_metadata_path", lambda vid: raw_meta / f"{vid}.json" + ) + processed, skipped = _process_queue() + assert processed == 1 + assert skipped >= 1 + + +@pytest.mark.django_db +def test_persist_fetched_video_empty_video_id(): + created, skipped = _persist_fetched_video({"video_id": ""}) + assert created is False and skipped is True + + +@pytest.mark.django_db +def test_command_phase_1_writes_summary(monkeypatch): + stdout = StringIO() + cmd = Command(stdout=stdout, stderr=StringIO()) + monkeypatch.setattr(f"{_CMD}._process_queue", lambda: (2, 1)) + cmd._phase_1(dry_run=False) + out = stdout.getvalue() + assert "Phase 1" in out and "2" in out and "1" in out + + +@pytest.mark.django_db +def test_command_phase_2_success_branch(monkeypatch): + stdout = StringIO() + cmd = Command(stdout=stdout, stderr=StringIO()) + monkeypatch.setattr(f"{_CMD}._run_phase_2", lambda *a: (3, 2)) + cmd._phase_2( + datetime(2024, 1, 1, tzinfo=timezone.utc), + datetime(2024, 2, 1, tzinfo=timezone.utc), + "", + ) + out = stdout.getvalue().lower() + assert "created" in out and "skipped" in out + + +@pytest.mark.django_db +def test_run_phase_2_counts_skipped(monkeypatch): + monkeypatch.setattr( + f"{_CMD}.fetch_videos", + lambda **_: [ + { + "video_id": "a", + "channel_id": "c", + "channel_title": "CppCon", + "title": "A", + }, + { + "video_id": "b", + "channel_id": "c", + "channel_title": "CppCon", + "title": "B", + }, + ], + ) + monkeypatch.setattr(f"{_CMD}.resolve_speakers", lambda **_: ["X"]) + monkeypatch.setattr( + f"{_CMD}._persist_fetched_video", + lambda d: (True, False) if d["video_id"] == "a" else (False, True), + ) + c, s = _run_phase_2( + datetime(2024, 1, 1, tzinfo=timezone.utc), + datetime(2024, 2, 1, tzinfo=timezone.utc), + "CppCon", + ) + assert c == 1 and s == 1 + + +@pytest.mark.django_db +def test_persist_video_logs_validation_error(monkeypatch, caplog): + import logging + from django.core.exceptions import ValidationError + + caplog.set_level(logging.WARNING) + monkeypatch.setattr( + f"{_CMD}.get_or_create_video", + lambda *_a, **_k: (_ for _ in ()).throw(ValidationError("bad")), + ) + monkeypatch.setattr(f"{_CMD}.get_or_create_channel", lambda *_a, **_k: None) + + created, skipped = _persist_video( + { + "video_id": "valerr", + "channel_id": "", + "channel_title": "", + "title": "T", + "description": "", + "tags": [], + } + ) + assert created is False and skipped is True + assert any("validation error" in r.message.lower() for r in caplog.records) diff --git a/cppa_youtube_script_tracker/tests/test_services.py b/cppa_youtube_script_tracker/tests/test_services.py index e87451de..e41934de 100644 --- a/cppa_youtube_script_tracker/tests/test_services.py +++ b/cppa_youtube_script_tracker/tests/test_services.py @@ -102,3 +102,38 @@ def test_get_or_create_tag_normalizes_case(): def test_get_or_create_tag_empty_raises(): with pytest.raises(ValueError, match="tag_name"): get_or_create_tag(" ") + + +@pytest.mark.django_db +def test_get_or_create_video_sparse_metadata_defaults(): + video, created = get_or_create_video("sparse1", None, {}) + assert created is True + assert video.title == "" + assert video.description == "" + assert video.duration_seconds == 0 + assert video.view_count is None + assert video.like_count is None + assert video.comment_count is None + assert video.published_at is None + assert video.scraped_at is None + + +@pytest.mark.django_db +def test_get_or_create_video_unparseable_datetime_strings_become_none(): + """django.utils.dateparse.parse_datetime returns None for invalid ISO strings.""" + video, _ = get_or_create_video( + "sparse2", + None, + {"published_at": "not-a-date", "scraped_at": "also-bad"}, + ) + assert video.published_at is None + assert video.scraped_at is None + + +@pytest.mark.django_db +def test_get_or_create_channel_empty_title_does_not_clear_existing(): + _ = get_or_create_channel("keep_title", "CppCon") + ch2 = get_or_create_channel("keep_title", "") + assert ch2.pk == "keep_title" + ch2.refresh_from_db() + assert ch2.channel_title == "CppCon"