diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 04fa7cda..296c08f6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,10 @@ Change Log Unreleased +[3.0.0] - 2026-05-25 +--------------------- +* feat: construct and index Spanish Algolia objects for skills, jobs and industries + [2.4.0] - 2026-03-26 --------------------- * fix: bump lightcast version from 8.9 to 9.41 diff --git a/taxonomy/__init__.py b/taxonomy/__init__.py index 6ac9c2da..d8189320 100644 --- a/taxonomy/__init__.py +++ b/taxonomy/__init__.py @@ -15,4 +15,4 @@ # 2. MINOR version when you add functionality in a backwards compatible manner, and # 3. PATCH version when you make backwards compatible bug fixes. # More details can be found at https://semver.org/ -__version__ = '2.4.0' +__version__ = '3.0.0' diff --git a/taxonomy/algolia/constants.py b/taxonomy/algolia/constants.py index ca7d4793..1e4fc23b 100644 --- a/taxonomy/algolia/constants.py +++ b/taxonomy/algolia/constants.py @@ -17,6 +17,10 @@ 'searchable(industry_names)', 'searchable(b2c_opt_in)', 'searchable(job_sources)', + 'metadata_language', + ], + 'customRanking': [ + 'asc(metadata_language)', ], } @@ -29,3 +33,5 @@ JOBS_TO_IGNORE = [ 'ET0000000000000000', # 'Unclassified' job ] + +TAXONOMY_TRANSLATION_LOCALES = ['es'] diff --git a/taxonomy/algolia/utils.py b/taxonomy/algolia/utils.py index c2143a95..a7f751cf 100644 --- a/taxonomy/algolia/utils.py +++ b/taxonomy/algolia/utils.py @@ -15,9 +15,10 @@ EMBEDDED_OBJECT_LENGTH_CAP, JOBS_PAGE_SIZE, JOBS_TO_IGNORE, + TAXONOMY_TRANSLATION_LOCALES, ) from taxonomy.algolia.serializers import JobSerializer -from taxonomy.models import Industry, IndustryJobSkill, Job, JobSkills +from taxonomy.models import Industry, IndustryJobSkill, Job, JobSkills, Skill, TaxonomyTranslation LOGGER = logging.getLogger(__name__) @@ -52,13 +53,14 @@ def __exit__(self, *args, **kwargs): def index_jobs_data_in_algolia(): """ - Re-Index all jobs data to algolia. + Re-Index all jobs data to algolia with translations. 
- This function is responsible for - 1. Constructing a list of dicts containing all jobs present in the database. - 2. Re-Indexing all data in a single atomic operations with zero downtime. + This function is responsible for: + 1. Constructing a list of dicts containing all jobs present in the database (English). + 2. Creating localized variants for enabled languages (e.g., Spanish). + 3. Re-Indexing all data in a single atomic operation with zero downtime. - Note: We need to construct a list of all jobs in the form of a list and the send it all in a single attempt to + Note: We need to construct a list of all jobs in the form of a list and send it all in a single attempt to make the operation atomic and make sure there is no downtime. Paginating DB data and incrementally adding objects to the index will cause downtime equal to the amount of time it will take to run the command. """ @@ -67,12 +69,23 @@ def index_jobs_data_in_algolia(): api_key=settings.ALGOLIA.get('API_KEY'), index_name=settings.ALGOLIA.get('TAXONOMY_INDEX_NAME'), ) + LOGGER.info('[TAXONOMY] Resetting algolia index settings from code.') client.set_index_settings(ALGOLIA_JOBS_INDEX_SETTINGS) LOGGER.info('[TAXONOMY] Fetching Jobs data from the database.') jobs_data = fetch_jobs_data() LOGGER.info('[TAXONOMY] Jobs data successfully fetched from the database.') + LOGGER.info(f'[TAXONOMY] Total English job records: {len(jobs_data)}') + + for language in TAXONOMY_TRANSLATION_LOCALES: + LOGGER.info(f'[TAXONOMY] Creating {language} job records.') + localized_jobs = create_localized_job_records(jobs_data, language) + jobs_data.extend(localized_jobs) + LOGGER.info(f'[TAXONOMY] Added {len(localized_jobs)} {language} records.') + + LOGGER.info(f'[TAXONOMY] Total records (all languages): {len(jobs_data)}') + LOGGER.info('[TAXONOMY] Indexing Jobs data on algolia.') client.replace_all_objects(jobs_data) LOGGER.info('[TAXONOMY] Jobs data successfully indexed on algolia.') @@ -236,6 +249,266 @@ def get_job_ids(qs): 
# Language tag of the untranslated source records. ``fetch_jobs_data`` stamps every
# record it serializes with ``metadata_language = 'en'``.
_SOURCE_LANGUAGE_CODE = 'en'


def _fetch_translated_titles(content_type, language_code):
    """
    Return ``{external_id: title}`` for the stored translations of one content type.

    Only the two columns that are needed are read, instead of full model instances.
    Empty titles are kept here; callers decide how to treat them.

    Args:
        content_type: One of 'job', 'skill' or 'industry'.
        language_code: Target language (e.g., 'es').
    """
    rows = TaxonomyTranslation.objects.filter(
        content_type=content_type,
        language_code=language_code,
    ).values_list('external_id', 'title')
    return dict(rows)


def _map_names_to_titles(id_name_pairs, titles_by_external_id):
    """
    Build ``{english_name: translated_title}`` from ``(external_id, english_name)`` pairs.

    Pairs that have no translation, or whose translated title is empty, are left out so
    that lookups fall back to the English name.
    """
    names_to_titles = {}
    for external_id, english_name in id_name_pairs:
        title = titles_by_external_id.get(external_id)
        if title:
            names_to_titles[english_name] = title
    return names_to_titles


def build_name_translation_maps(language_code):
    """
    Build direct English-name -> translated-name dictionaries from the database.

    Translations are stored per external id; this joins them with the Job, Skill and
    Industry tables so callers can translate a name without a name -> id -> translation
    two-step lookup.

    Note: the maps are keyed by English name, so if two entities of the same type share
    a name the one iterated last wins.

    Args:
        language_code: Target language (e.g., 'es')

    Returns:
        dict: {
            'job': {english_name: translated_name},
            'skill': {english_name: translated_name},
            'industry': {english_name: translated_name},
        }
    """
    LOGGER.info(f'[TAXONOMY] Building {language_code} translation maps from database.')

    job_titles = _fetch_translated_titles('job', language_code)
    skill_titles = _fetch_translated_titles('skill', language_code)
    industry_titles = _fetch_translated_titles('industry', language_code)

    # Jobs without a name and the explicitly ignored jobs never get a translation entry.
    job_pairs = Job.objects.exclude(
        Q(name__isnull=True) | Q(external_id__in=JOBS_TO_IGNORE)
    ).values_list('external_id', 'name')
    skill_pairs = Skill.objects.exclude(external_id__isnull=True).values_list('external_id', 'name')
    # Industry translations are looked up by the string form of the industry code.
    industry_pairs = (
        (str(code), name)
        for code, name in Industry.objects.exclude(code__isnull=True).values_list('code', 'name')
    )

    job_translations = _map_names_to_titles(job_pairs, job_titles)
    skill_translations = _map_names_to_titles(skill_pairs, skill_titles)
    industry_translations = _map_names_to_titles(industry_pairs, industry_titles)

    LOGGER.info(
        f'[TAXONOMY] Built translation maps: {len(job_translations)} jobs, '
        f'{len(skill_translations)} skills, {len(industry_translations)} industries'
    )

    return {
        'job': job_translations,
        'skill': skill_translations,
        'industry': industry_translations,
    }


def translate_skill_dict(skill, name_translation_maps):
    """
    Return a copy of a skill dict with its name translated.

    Every other key is copied unchanged (the description is NOT translated here). When
    the name has no translation the English name is kept; a skill without a 'name' key
    gets an empty-string name.

    Args:
        skill: Dict with skill data from JobSerializer
        name_translation_maps: Name maps as returned by ``build_name_translation_maps``

    Returns:
        New dict with the same keys as the input plus a (possibly translated) 'name'
    """
    english_name = skill.get('name', '')
    return {
        **skill,
        'name': name_translation_maps['skill'].get(english_name, english_name),
    }


def translate_industries_array(industries, name_translation_maps):
    """
    Translate a list of industry dicts, including their nested skill names.

    Each output dict contains only 'name' and 'skills'; nested skills are plain strings.
    Names without a translation are kept in English.

    Args:
        industries: List of industry dicts from JobSerializer
        name_translation_maps: Name maps as returned by ``build_name_translation_maps``

    Returns:
        New list of ``{'name': str, 'skills': [str, ...]}`` dicts
    """
    translated_industries = []
    for industry in industries:
        english_name = industry.get('name', '')
        translated_industries.append({
            'name': name_translation_maps['industry'].get(english_name, english_name),
            'skills': [
                name_translation_maps['skill'].get(skill_name, skill_name)
                for skill_name in industry.get('skills', [])
            ],
        })
    return translated_industries


def translate_job_record(english_job, name_translation_maps, description_translation_maps, language_code):
    """
    Build the localized variant of a single serialized job record.

    The result has a distinct objectID (``job-<external_id>-<language_code>``) and
    ``metadata_language`` set to the target language. Names are translated through the
    name maps; the description comes from the job's translation object when it has a
    non-empty description. Anything without a translation falls back to English.

    Args:
        english_job: Dict with English job data (from JobSerializer)
        name_translation_maps: Name maps as returned by ``build_name_translation_maps``
        description_translation_maps: ``{content_type: {external_id: translation}}``; only the
            'job' entry is consulted, and only the translation's ``description`` attribute is read
        language_code: Target language code (e.g., 'es')

    Returns:
        New dict for the localized record; the input dict is not modified
    """
    external_id = english_job.get('external_id')
    english_name = english_job.get('name', '')

    # Descriptions are keyed by external id, unlike names which are keyed by English name.
    job_translation = description_translation_maps.get('job', {}).get(external_id)
    if job_translation and job_translation.description:
        description = job_translation.description
    else:
        description = english_job.get('description', '')

    return {
        # Identity / metadata
        'objectID': f"job-{external_id}-{language_code}",
        'id': english_job.get('id'),
        'external_id': external_id,
        'metadata_language': language_code,

        # Translated top-level fields
        'name': name_translation_maps['job'].get(english_name, english_name),
        'description': description,

        # Skill dicts keep their schema; only the name changes
        'skills': [
            translate_skill_dict(skill, name_translation_maps)
            for skill in english_job.get('skills', [])
        ],

        # Job postings are copied untouched
        'job_postings': english_job.get('job_postings', []),

        'industry_names': [
            name_translation_maps['industry'].get(industry_name, industry_name)
            for industry_name in english_job.get('industry_names', [])
        ],
        'industries': translate_industries_array(
            english_job.get('industries', []),
            name_translation_maps
        ),
        'similar_jobs': [
            name_translation_maps['job'].get(similar_name, similar_name)
            for similar_name in english_job.get('similar_jobs', [])
        ],

        # Non-translatable fields are copied untouched
        'b2c_opt_in': english_job.get('b2c_opt_in', False),
        'job_sources': english_job.get('job_sources', []),
    }


def create_localized_job_records(english_jobs, language_code):
    """
    Create localized variants of English job records.

    Only source-language records are translated: a record whose ``metadata_language`` is
    present and not 'en' is skipped. This matters because the indexing loop appends each
    locale's output to the same list it passes in for the next locale; without the filter
    a second locale would re-translate the first locale's records and produce duplicate
    ``job-<external_id>-<language_code>`` objectIDs.

    Args:
        english_jobs: List of serialized job dicts (records without ``metadata_language``
            are treated as English)
        language_code: Target language code (e.g., 'es', 'fr', 'ar')

    Returns:
        List of localized job dicts, one per English input record, in input order.
        Returns an empty list, without touching the database, when there is nothing
        to translate.
    """
    source_jobs = [
        job for job in english_jobs
        if job.get('metadata_language', _SOURCE_LANGUAGE_CODE) == _SOURCE_LANGUAGE_CODE
    ]
    if not source_jobs:
        return []

    LOGGER.info(f'[TAXONOMY] Building {language_code} translation maps.')

    name_translation_maps = build_name_translation_maps(language_code)

    # Descriptions are looked up by external id, so keep the translation objects per type.
    description_translation_maps = {
        'job': {},
        'skill': {},
        'industry': {},
    }
    for trans in TaxonomyTranslation.objects.filter(language_code=language_code):
        # setdefault: a translation row with an unexpected content type must not abort indexing.
        description_translation_maps.setdefault(trans.content_type, {})[trans.external_id] = trans

    LOGGER.info(
        f'[TAXONOMY] Loaded {len(name_translation_maps["job"])} job name, '
        f'{len(name_translation_maps["skill"])} skill name, '
        f'{len(name_translation_maps["industry"])} industry name translations.'
    )

    total = len(source_jobs)
    localized_jobs = []
    for idx, english_job in enumerate(source_jobs, 1):
        if idx % 1000 == 0:
            LOGGER.info(f'[TAXONOMY] Translated {idx}/{total} jobs to {language_code}')

        localized_jobs.append(
            translate_job_record(
                english_job,
                name_translation_maps,
                description_translation_maps,
                language_code
            )
        )

    LOGGER.info(f'[TAXONOMY] Completed translating {len(localized_jobs)} jobs to {language_code}')
    return localized_jobs
@@ -266,6 +539,9 @@ def fetch_jobs_data(): 'jobs_having_industry_skills': get_job_ids(IndustryJobSkill.get_whitelisted_job_skill_qs()), }, ) + # Add metadata_language to English records + for job_data in job_serializer.data: + job_data['metadata_language'] = 'en' jobs.extend(job_serializer.data) start += page_size diff --git a/taxonomy/tests/test_algolia_translations.py b/taxonomy/tests/test_algolia_translations.py new file mode 100644 index 00000000..338f472f --- /dev/null +++ b/taxonomy/tests/test_algolia_translations.py @@ -0,0 +1,519 @@ +# -*- coding: utf-8 -*- +""" +Tests for Algolia translation utilities. +""" +import logging +from unittest.mock import MagicMock + +import pytest + +import taxonomy.algolia.utils as algolia_utils +from taxonomy.algolia.utils import ( + build_name_translation_maps, + create_localized_job_records, + fetch_jobs_data, + index_jobs_data_in_algolia, + translate_industries_array, + translate_job_record, + translate_skill_dict, +) +from taxonomy.models import Industry, IndustryJobSkill, Job, JobSkills, Skill, TaxonomyTranslation + + +@pytest.mark.django_db +class TestBuildNameTranslationMaps: + """Test building translation maps.""" + + @pytest.mark.parametrize('content_type,model_class,external_id,name,translation', [ + ('job', Job, 'ET123', 'Software Engineer', 'Ingeniero de Software'), + ('skill', Skill, 'ES123', 'Python', 'Python (Programación)'), + ('industry', Industry, '54', 'Information Technology', 'Tecnología de la Información'), + ]) + def test_builds_translations(self, content_type, model_class, external_id, name, translation): + """Test building translations for jobs, skills, and industries.""" + if model_class == Industry: + model_class.objects.create(code=external_id, name=name) + else: + model_class.objects.create(external_id=external_id, name=name) + + TaxonomyTranslation.objects.create( + external_id=external_id, + content_type=content_type, + language_code='es', + title=translation + ) + + maps = 
build_name_translation_maps('es') + + assert maps[content_type][name] == translation + + def test_skips_empty_translations(self): + """Test that empty translations are not included in maps.""" + job = Job.objects.create(external_id='ET123', name='Software Engineer') + TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='' # Empty translation + ) + + maps = build_name_translation_maps('es') + + # Empty translation should not be in the map + assert 'Software Engineer' not in maps['job'] + + def test_returns_empty_maps_when_no_translations(self): + """Test returns empty dicts when no translations exist.""" + Job.objects.create(external_id='ET123', name='Software Engineer') + + maps = build_name_translation_maps('es') + + assert maps['job'] == {} + assert maps['skill'] == {} + assert maps['industry'] == {} + + +@pytest.mark.django_db +class TestTranslateSkillDict: + """Test skill dict translation.""" + + def test_translates_skill_name(self): + """Test skill name is translated and all fields are preserved.""" + skill = { + 'name': 'Python', + 'description': 'Programming language', + 'significance': 85, + 'type_id': 'ST1' + } + name_maps = { + 'skill': {'Python': 'Python (Lenguaje)'} + } + + result = translate_skill_dict(skill, name_maps) + + assert result['name'] == 'Python (Lenguaje)' + assert result['description'] == 'Programming language' + assert result['significance'] == 85 + assert result['type_id'] == 'ST1' + + def test_fallback_when_translation_missing(self): + """Test falls back to English when translation not found.""" + skill = {'name': 'JavaScript'} + name_maps = {'skill': {'Python': 'Python (ES)'}} + + result = translate_skill_dict(skill, name_maps) + + assert result['name'] == 'JavaScript' + + +@pytest.mark.django_db +class TestTranslateIndustriesArray: + """Test industries array translation.""" + + def test_translates_industry_names(self): + """Test industry names are translated.""" + industries = 
[ + { + 'name': 'Information Technology', + 'skills': [] + } + ] + name_maps = { + 'industry': {'Information Technology': 'Tecnología de la Información'}, + 'skill': {} + } + + result = translate_industries_array(industries, name_maps) + + assert result[0]['name'] == 'Tecnología de la Información' + + def test_translates_nested_skills(self): + """Test nested skills are translated.""" + industries = [ + { + 'name': 'IT', + 'skills': ['Python', 'Java', 'Cloud Computing'] + } + ] + name_maps = { + 'industry': {'IT': 'TI'}, + 'skill': { + 'Python': 'Python (Programación)', + 'Cloud Computing': 'Computación en la Nube' + } + } + + result = translate_industries_array(industries, name_maps) + + assert result[0]['name'] == 'TI' + assert result[0]['skills'][0] == 'Python (Programación)' + assert result[0]['skills'][1] == 'Java' # Fallback + assert result[0]['skills'][2] == 'Computación en la Nube' + + def test_handles_empty_industries(self): + """Test handles empty industries list.""" + result = translate_industries_array([], {'industry': {}, 'skill': {}}) + + assert result == [] + + +@pytest.mark.django_db +class TestTranslateJobRecord: + """Test job record translation.""" + + def test_translates_job_name(self): + """Test job name is translated.""" + Job.objects.create(external_id='ET123', name='Software Engineer') + TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='Ingeniero de Software', + description='Desarrolla software' + ) + + english_job = { + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Software Engineer', + 'description': 'Develops software', + 'skills': [], + 'job_postings': [], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 'b2c_opt_in': True, + 'job_sources': ['course_skill'] + } + + name_maps = {'job': {'Software Engineer': 'Ingeniero de Software'}, 'skill': {}, 'industry': {}} + desc_maps = { + 'job': { + 'ET123': 
TaxonomyTranslation.objects.get(external_id='ET123') + }, + 'skill': {}, + 'industry': {} + } + + result = translate_job_record(english_job, name_maps, desc_maps, 'es') + + assert result['objectID'] == 'job-ET123-es' + assert result['name'] == 'Ingeniero de Software' + assert result['description'] == 'Desarrolla software' + assert result['metadata_language'] == 'es' + + def test_translates_all_nested_fields(self): + """Test all nested arrays are translated.""" + english_job = { + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Engineer', + 'description': 'Desc', + 'skills': [ + {'name': 'Python', 'significance': 90} + ], + 'job_postings': [{'id': 1}], + 'industry_names': ['IT', 'Software'], + 'industries': [ + {'name': 'IT', 'skills': ['Cloud']} + ], + 'similar_jobs': ['Senior Engineer', 'Architect'], + 'b2c_opt_in': False, + 'job_sources': ['job_skill'] + } + + name_maps = { + 'job': {'Engineer': 'Ingeniero', 'Senior Engineer': 'Ingeniero Senior'}, + 'skill': {'Python': 'Python (ES)', 'Cloud': 'Nube'}, + 'industry': {'IT': 'TI', 'Software': 'Software'} + } + desc_maps = {'job': {}, 'skill': {}, 'industry': {}} + + result = translate_job_record(english_job, name_maps, desc_maps, 'es') + + # Check skills translated + assert result['skills'][0]['name'] == 'Python (ES)' + assert result['skills'][0]['significance'] == 90 + + # Check industry_names translated + assert result['industry_names'] == ['TI', 'Software'] + + # Check industries with nested skills translated + assert result['industries'][0]['name'] == 'TI' + assert result['industries'][0]['skills'] == ['Nube'] + + # Check similar_jobs translated + assert result['similar_jobs'][0] == 'Ingeniero Senior' + assert result['similar_jobs'][1] == 'Architect' # Fallback + + def test_preserves_non_translatable_fields(self): + """Test non-translatable fields are preserved.""" + english_job = { + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Engineer', + 'description': '', 
+ 'skills': [], + 'job_postings': [{'id': 1, 'url': 'http://example.com'}], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 'b2c_opt_in': True, + 'job_sources': ['course_skill', 'job_skill'] + } + + name_maps = {'job': {}, 'skill': {}, 'industry': {}} + desc_maps = {'job': {}, 'skill': {}, 'industry': {}} + + result = translate_job_record(english_job, name_maps, desc_maps, 'es') + + assert result['job_postings'] == [{'id': 1, 'url': 'http://example.com'}] + assert result['b2c_opt_in'] is True + assert result['job_sources'] == ['course_skill', 'job_skill'] + assert result['id'] == 1 + assert result['external_id'] == 'ET123' + + def test_fallback_to_english_when_no_description(self): + """Test falls back to English description when translation empty.""" + english_job = { + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Engineer', + 'description': 'English description', + 'skills': [], + 'job_postings': [], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 'b2c_opt_in': False, + 'job_sources': [] + } + + name_maps = {'job': {}, 'skill': {}, 'industry': {}} + desc_maps = {'job': {}, 'skill': {}, 'industry': {}} + + result = translate_job_record(english_job, name_maps, desc_maps, 'es') + + assert result['description'] == 'English description' + + +@pytest.mark.django_db +class TestCreateLocalizedJobRecords: + """Test creating localized job records.""" + + def test_creates_spanish_records(self): + """Test creates Spanish variant of English jobs.""" + job = Job.objects.create(external_id='ET123', name='Engineer') + TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='Ingeniero', + description='Descripción' + ) + + english_jobs = [{ + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Engineer', + 'description': 'Description', + 'skills': [], + 'job_postings': [], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 
'b2c_opt_in': False, + 'job_sources': [] + }] + + spanish_jobs = create_localized_job_records(english_jobs, 'es') + + assert len(spanish_jobs) == 1 + assert spanish_jobs[0]['objectID'] == 'job-ET123-es' + assert spanish_jobs[0]['name'] == 'Ingeniero' + assert spanish_jobs[0]['description'] == 'Descripción' + assert spanish_jobs[0]['metadata_language'] == 'es' + + def test_creates_multiple_records(self): + """Test creates translations for multiple jobs.""" + Job.objects.create(external_id='ET1', name='Engineer') + Job.objects.create(external_id='ET2', name='Designer') + + TaxonomyTranslation.objects.create( + external_id='ET1', content_type='job', language_code='es', title='Ingeniero' + ) + TaxonomyTranslation.objects.create( + external_id='ET2', content_type='job', language_code='es', title='Diseñador' + ) + + english_jobs = [ + { + 'objectID': 'job-ET1', 'id': 1, 'external_id': 'ET1', 'name': 'Engineer', + 'description': '', 'skills': [], 'job_postings': [], 'industry_names': [], + 'industries': [], 'similar_jobs': [], 'b2c_opt_in': False, 'job_sources': [] + }, + { + 'objectID': 'job-ET2', 'id': 2, 'external_id': 'ET2', 'name': 'Designer', + 'description': '', 'skills': [], 'job_postings': [], 'industry_names': [], + 'industries': [], 'similar_jobs': [], 'b2c_opt_in': False, 'job_sources': [] + } + ] + + spanish_jobs = create_localized_job_records(english_jobs, 'es') + + assert len(spanish_jobs) == 2 + assert spanish_jobs[0]['name'] == 'Ingeniero' + assert spanish_jobs[0]['metadata_language'] == 'es' + assert spanish_jobs[1]['name'] == 'Diseñador' + assert spanish_jobs[1]['metadata_language'] == 'es' + + def test_handles_partial_translations(self): + """Test gracefully handles missing translations.""" + Job.objects.create(external_id='ET1', name='Engineer') + TaxonomyTranslation.objects.create( + external_id='ET1', content_type='job', language_code='es', title='Ingeniero' + ) + + english_jobs = [{ + 'objectID': 'job-ET1', 'id': 1, 'external_id': 'ET1', 'name': 
'Engineer', + 'description': '', 'skills': [{'name': 'Python'}], 'job_postings': [], + 'industry_names': ['IT'], 'industries': [], 'similar_jobs': ['Architect'], + 'b2c_opt_in': False, 'job_sources': [] + }] + + # No skill/industry translations + spanish_jobs = create_localized_job_records(english_jobs, 'es') + + # Should fall back to English for missing translations + assert spanish_jobs[0]['name'] == 'Ingeniero' # Translated + assert spanish_jobs[0]['skills'][0]['name'] == 'Python' # Fallback + assert spanish_jobs[0]['industry_names'][0] == 'IT' # Fallback + assert spanish_jobs[0]['similar_jobs'][0] == 'Architect' # Fallback + assert spanish_jobs[0]['metadata_language'] == 'es' + + def test_adds_metadata_language_field(self): + """Test metadata_language field is added to translated jobs.""" + job = Job.objects.create(external_id='ET123', name='Engineer') + + english_jobs = [{ + 'objectID': 'job-ET123', + 'id': 1, + 'external_id': 'ET123', + 'name': 'Engineer', + 'description': 'Desc', + 'skills': [], + 'job_postings': [], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 'b2c_opt_in': False, + 'job_sources': [] + }] + + spanish_jobs = create_localized_job_records(english_jobs, 'es') + + assert 'metadata_language' in spanish_jobs[0] + assert spanish_jobs[0]['metadata_language'] == 'es' + + def test_returns_empty_list_when_no_jobs(self): + """Test returns empty list when no jobs provided.""" + result = create_localized_job_records([], 'es') + + assert result == [] + + def test_logs_progress_every_thousand_records(self, caplog, monkeypatch): + """Test progress log branch is hit when processing every 1000th record.""" + english_jobs = [ + { + 'objectID': f'job-{idx}', + 'id': idx, + 'external_id': f'ET{idx}', + 'name': 'Engineer', + 'description': '', + 'skills': [], + 'job_postings': [], + 'industry_names': [], + 'industries': [], + 'similar_jobs': [], + 'b2c_opt_in': False, + 'job_sources': [], + } + for idx in range(1, 1001) + ] + + 
monkeypatch.setattr( + algolia_utils, + 'translate_job_record', + lambda english_job, *_: { + **english_job, + 'metadata_language': 'es', + } + ) + + caplog.set_level(logging.INFO) + result = create_localized_job_records(english_jobs, 'es') + + assert len(result) == 1000 + assert any('Translated 1000/1000 jobs to es' in record.message for record in caplog.records) + + +@pytest.mark.django_db +class TestIndexJobsDataInAlgolia: + """Tests for full index build flow with localized records.""" + + def test_indexes_english_and_localized_jobs(self, monkeypatch): + """Test indexing appends localized records for each configured language.""" + client = MagicMock() + monkeypatch.setattr(algolia_utils, 'AlgoliaClient', MagicMock(return_value=client)) + + english_jobs = [{'objectID': 'job-ET1', 'name': 'Engineer', 'metadata_language': 'en'}] + monkeypatch.setattr(algolia_utils, 'fetch_jobs_data', lambda: list(english_jobs)) + monkeypatch.setattr(algolia_utils, 'TAXONOMY_TRANSLATION_LOCALES', ['es', 'fr']) + + def _create_localized(jobs_data, language_code): + return [{'objectID': f'job-ET1-{language_code}', 'name': 'Engineer', 'metadata_language': language_code}] + + monkeypatch.setattr(algolia_utils, 'create_localized_job_records', _create_localized) + + index_jobs_data_in_algolia() + + client.set_index_settings.assert_called_once() + indexed_objects = client.replace_all_objects.call_args[0][0] + assert len(indexed_objects) == 3 + assert {obj['metadata_language'] for obj in indexed_objects} == {'en', 'es', 'fr'} + + +@pytest.mark.django_db +class TestFetchJobsData: + """Tests for english jobs serialization payload.""" + + def test_adds_metadata_language_to_serialized_jobs(self, monkeypatch): + """Test serialized jobs include metadata_language='en'.""" + Job.objects.create(external_id='ET1', name='Engineer') + + monkeypatch.setattr(algolia_utils, 'fetch_and_combine_job_details', lambda _qs: {}) + monkeypatch.setattr(algolia_utils, 'combine_industry_skills', lambda: {}) + 
monkeypatch.setattr(algolia_utils, 'get_job_ids', lambda _qs: set()) + monkeypatch.setattr(JobSkills, 'get_whitelisted_job_skill_qs', classmethod(lambda cls: JobSkills.objects.none())) + monkeypatch.setattr( + IndustryJobSkill, + 'get_whitelisted_job_skill_qs', + classmethod(lambda cls: IndustryJobSkill.objects.none()) + ) + + class DummySerializer: + """Serializer stub for deterministic test payload.""" + + def __init__(self, *args, **kwargs): + self.data = [{'objectID': 'job-ET1', 'name': 'Engineer'}] + + monkeypatch.setattr(algolia_utils, 'JobSerializer', DummySerializer) + + jobs = fetch_jobs_data() + + assert jobs == [{'objectID': 'job-ET1', 'name': 'Engineer', 'metadata_language': 'en'}]