diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 86f537b7..8c7e184e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,11 +13,14 @@ Change Log Unreleased +[2.3.20] - 2026-01-29 +--------------------- +* feat: translate skills, jobs and industries + [2.3.19] - 2025-11-26 --------------------- * chore: upgrade python requirements - [2.3.18] - 2025-10-30 --------------------- * fix: pin `pip<25.3` to resolve make upgrade build failure diff --git a/Makefile b/Makefile index b22fc528..bb6c279e 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ pii_check: ## check for PII annotations on all Django models requirements: ## install development environment requirements pip install -qr requirements/pip.txt - pip install -q -r requirements/pip_tools.txt -c requirements/constraints.txt + pip install -q -r requirements/pip-tools.txt -c requirements/constraints.txt pip-sync requirements/dev.txt test: clean ## run tests in the current virtualenv diff --git a/taxonomy/__init__.py b/taxonomy/__init__.py index 8417a10e..24f5ef26 100644 --- a/taxonomy/__init__.py +++ b/taxonomy/__init__.py @@ -15,4 +15,4 @@ # 2. MINOR version when you add functionality in a backwards compatible manner, and # 3. PATCH version when you make backwards compatible bug fixes. 
# More details can be found at https://semver.org/ -__version__ = '2.3.19' +__version__ = '2.3.20' diff --git a/taxonomy/admin.py b/taxonomy/admin.py index 54ac102e..2f055b4b 100644 --- a/taxonomy/admin.py +++ b/taxonomy/admin.py @@ -11,6 +11,7 @@ from django.contrib import admin, messages from django.http import HttpResponseRedirect from django.urls import re_path, reverse +from django.utils.html import format_html from taxonomy.constants import JOB_SKILLS_URL_NAME from taxonomy.models import ( @@ -30,6 +31,7 @@ SkillsQuiz, SkillSubCategory, SkillValidationConfiguration, + TaxonomyTranslation, Translation, XBlockSkillData, XBlockSkills, @@ -301,3 +303,122 @@ class SkillValidationConfiguratonAdmin(admin.ModelAdmin): """ Admin view for SkillValidationConfiguration model. """ + + +@admin.register(TaxonomyTranslation) +class TaxonomyTranslationAdmin(admin.ModelAdmin): + """ + Admin view for TaxonomyTranslation model. + + Displays translations for jobs, skills, and industries across different languages. + """ + + list_display = ( + 'id', + 'content_type', + 'external_id', + 'language_code', + 'title_preview', + 'view_source_object', + ) + + list_filter = ( + 'content_type', + 'language_code', + ) + + search_fields = ( + 'external_id', + 'title', + 'description', + ) + + readonly_fields = ( + 'created', + 'modified', + 'source_hash', + 'source_object_link', + ) + + fieldsets = ( + ('Entity Information', { + 'fields': ('content_type', 'external_id', 'language_code', 'source_object_link'), + }), + ('Translation', { + 'fields': ('title', 'description'), + }), + ('Metadata', { + 'fields': ('source_hash', 'created', 'modified'), + 'classes': ('collapse',), + }), + ) + + ordering = ('-modified',) + + @admin.display(description='Title') + def title_preview(self, obj): + """ + Display truncated title for readability. + """ + return obj.title[:75] + '...' 
if len(obj.title) > 75 else obj.title + + @admin.display(description='Source Object') + def view_source_object(self, obj): + """ + Display a link to view the source object (Job, Skill, or Industry). + """ + try: + if obj.content_type == 'job': + job = Job.objects.get(external_id=obj.external_id) + url = reverse('admin:taxonomy_job_change', args=[job.pk]) + return format_html('View Job', url) + elif obj.content_type == 'skill': + skill = Skill.objects.get(external_id=obj.external_id) + url = reverse('admin:taxonomy_skill_change', args=[skill.pk]) + return format_html('View Skill', url) + elif obj.content_type == 'industry': + industry = Industry.objects.get(code=obj.external_id) + url = reverse('admin:taxonomy_industry_change', args=[industry.pk]) + return format_html('View Industry', url) + except (Job.DoesNotExist, Skill.DoesNotExist, Industry.DoesNotExist): + return '-' + + return '-' + + @admin.display(description='Source Object Details') + def source_object_link(self, obj): + """ + Display detailed link to source object in the detail view. 
+ """ + try: + if obj.content_type == 'job': + job = Job.objects.get(external_id=obj.external_id) + url = reverse('admin:taxonomy_job_change', args=[job.pk]) + return format_html( + '' + 'Open Job: {} (ID: {})', + url, job.name, job.external_id + ) + elif obj.content_type == 'skill': + skill = Skill.objects.get(external_id=obj.external_id) + url = reverse('admin:taxonomy_skill_change', args=[skill.pk]) + return format_html( + '' + 'Open Skill: {} (ID: {})', + url, skill.name, skill.external_id + ) + elif obj.content_type == 'industry': + industry = Industry.objects.get(code=obj.external_id) + url = reverse('admin:taxonomy_industry_change', args=[industry.pk]) + return format_html( + '' + 'Open Industry: {} (Code: {})', + url, industry.name, industry.code + ) + except (Job.DoesNotExist, Skill.DoesNotExist, Industry.DoesNotExist): + return format_html( + 'Source object not found (external_id: {})', + obj.external_id + ) + + return '-' diff --git a/taxonomy/management/commands/populate_taxonomy_translations.py b/taxonomy/management/commands/populate_taxonomy_translations.py new file mode 100644 index 00000000..646b711f --- /dev/null +++ b/taxonomy/management/commands/populate_taxonomy_translations.py @@ -0,0 +1,390 @@ +# -*- coding: utf-8 -*- +""" +Management command to populate taxonomy translations using Xpert AI. + +This command translates job, skill, and industry data from English to target languages. +It uses source_hash to detect changes and avoid unnecessary retranslations. 
+ +Example usage: + python manage.py populate_taxonomy_translations --language es + python manage.py populate_taxonomy_translations --language es --content-type job + python manage.py populate_taxonomy_translations --language es --force +""" +import logging + +from django.core.exceptions import ValidationError +from django.core.management.base import BaseCommand, CommandError +from django.db import DatabaseError, IntegrityError + +from taxonomy.models import Industry, Job, Skill, TaxonomyTranslation +from taxonomy.translation_utils import ( + TranslationError, + get_supported_languages, + translate_item_with_xpert, + validate_language_code, +) + +LOGGER = logging.getLogger(__name__) + + +class Command(BaseCommand): + """ + Populate taxonomy translations using Xpert AI. + + This management command translates jobs, skills, and industries from English + to target languages. It intelligently skips translations that are already + up-to-date using MD5 hash comparison of source text. + """ + + help = ( + 'Populate taxonomy translations using Xpert AI. ' + 'Translates jobs, skills, and industries to target languages. ' + 'Uses source_hash to skip unchanged content and avoid unnecessary API calls.' + ) + + def add_arguments(self, parser): + """Add command-line arguments.""" + parser.add_argument( + '--language', + type=str, + required=True, + help=( + 'Target language code (ISO 639-1). ' + 'Supported: {languages}'.format( + languages=', '.join(get_supported_languages()) + ) + ) + ) + + parser.add_argument( + '--content-type', + type=str, + choices=['job', 'skill', 'industry', 'all'], + default='all', + help='Type of content to translate. Default: all' + ) + + parser.add_argument( + '--batch-size', + type=int, + default=100, + help='Number of items to fetch from database in each batch. Default: 100' + ) + + parser.add_argument( + '--force', + action='store_true', + help=( + 'Force retranslation even if source_hash matches. ' + 'Useful when translation quality has improved.' 
+ ) + ) + + def handle(self, *args, **options): + """Execute the command.""" + + # Extract options + language = options['language'] + content_type = options['content_type'] + batch_size = options['batch_size'] + force = options['force'] + + # Validate language code + if not validate_language_code(language): + raise CommandError( + 'Unsupported language code: {language}. ' + 'Supported languages: {supported}'.format( + language=language, + supported=', '.join(get_supported_languages()) + ) + ) + + # Log configuration + LOGGER.info('=' * 60) + LOGGER.info('Taxonomy Translation - Xpert AI') + LOGGER.info('=' * 60) + LOGGER.info('Configuration:') + LOGGER.info(' • Target language: %s', language) + LOGGER.info(' • Content type: %s', content_type) + LOGGER.info(' • Database batch size: %d', batch_size) + LOGGER.info(' • Force retranslation: %s', 'Yes' if force else 'No') + + LOGGER.info( + 'Starting taxonomy translation: language=%s, content_type=%s, batch_size=%d, force=%s', + language, content_type, batch_size, force + ) + + # Initialize statistics + stats = {'translated': 0, 'skipped': 0, 'errors': 0} + + # Translate each content type + if content_type in ['job', 'all']: + job_stats = self.translate_content_type( + model=Job, + content_type_name='job', + language=language, + batch_size=batch_size, + force=force + ) + for key in stats: + stats[key] += job_stats[key] + + if content_type in ['skill', 'all']: + skill_stats = self.translate_content_type( + model=Skill, + content_type_name='skill', + language=language, + batch_size=batch_size, + force=force + ) + for key in stats: + stats[key] += skill_stats[key] + + if content_type in ['industry', 'all']: + industry_stats = self.translate_content_type( + model=Industry, + content_type_name='industry', + language=language, + batch_size=batch_size, + force=force + ) + for key in stats: + stats[key] += industry_stats[key] + + # Log summary + LOGGER.info('=' * 60) + LOGGER.info('Translation Summary') + LOGGER.info('=' * 60) + 
LOGGER.info('Translated: %d', stats['translated']) + LOGGER.info('Skipped (unchanged): %d', stats['skipped']) + + if stats['errors'] > 0: + LOGGER.error('Errors: %d', stats['errors']) + LOGGER.error('Translation completed with %d errors', stats['errors']) + else: + LOGGER.info('Errors: 0') + + total_processed = stats['translated'] + stats['skipped'] + stats['errors'] + LOGGER.info('Total processed: %d', total_processed) + + LOGGER.info( + 'Translation completed: translated=%d, skipped=%d, errors=%d', + stats['translated'], stats['skipped'], stats['errors'] + ) + + def translate_content_type(self, model, content_type_name, language, batch_size, force): + """ + Translate all entities of a content type to target language. + + This is a generic method that works for Job, Skill, and Industry models. + + Args: + model: Django model class (Job, Skill, or Industry) + content_type_name (str): Content type name ('job', 'skill', 'industry') + language (str): Target language code + batch_size (int): Number of items to fetch from database per batch + force (bool): Force retranslation + + Returns: + dict: Statistics for this content type + """ + LOGGER.info('Starting translation for content_type=%s', content_type_name) + + # Get queryset - exclude items without required fields + queryset = model.objects.exclude(name__isnull=True) + + # For jobs and skills, also exclude those without external_id + if hasattr(model, 'external_id'): + queryset = queryset.exclude(external_id__isnull=True) + + total = queryset.count() + + if total == 0: + LOGGER.info('No %ss found to translate', content_type_name) + return {'translated': 0, 'skipped': 0, 'errors': 0} + + LOGGER.info('Found %d %ss to process', total, content_type_name) + + # Track statistics for this content type + stats = {'translated': 0, 'skipped': 0, 'errors': 0} + + # Process in batches + for i in range(0, total, batch_size): + batch = queryset[i:i + batch_size] + batch_stats = self.process_batch( + entities=batch, + 
content_type_name=content_type_name, + language=language, + force=force + ) + + stats['translated'] += batch_stats['translated'] + stats['skipped'] += batch_stats['skipped'] + stats['errors'] += batch_stats['errors'] + + LOGGER.info( + 'Processed %d/%d %ss', + min(i + batch_size, total), + total, + content_type_name + ) + + LOGGER.info( + 'Completed translation for content_type=%s: translated=%d, skipped=%d, errors=%d', + content_type_name, + stats['translated'], + stats['skipped'], + stats['errors'] + ) + + return stats + + def process_batch(self, entities, content_type_name, language, force): + """ + Process a batch of entities for translation. + + This is a generic method that works for any entity type (Job, Skill, Industry). + + Args: + entities (QuerySet): Batch of entity objects + content_type_name (str): Content type name + language (str): Target language code + force (bool): Force retranslation + + Returns: + dict: Batch statistics + """ + batch_stats = {'translated': 0, 'skipped': 0, 'errors': 0} + + items_to_translate = [] + + for entity in entities: + # - Job/Skill: use external_id + # - Industry: use code NAICS2 code + if hasattr(entity, 'external_id'): + external_id = entity.external_id + else: + # Industry: use NAICS2 code + external_id = str(entity.code) + + # Get description (not available for Industry) + description = getattr(entity, 'description', '') or '' + + # Calculate source hash + source_hash = TaxonomyTranslation.calculate_source_hash( + entity.name, + description + ) + + # Check if translation needs updating + should_translate, __ = self._should_translate( + external_id=external_id, + content_type=content_type_name, + language=language, + source_hash=source_hash, + force=force + ) + + if not should_translate: + batch_stats['skipped'] += 1 + continue + + items_to_translate.append({ + 'external_id': external_id, + 'title': entity.name, + 'description': description, + 'source_hash': source_hash, + }) + + # Translate items one at a time using 
Xpert AI + for idx, item in enumerate(items_to_translate, 1): + LOGGER.info( + 'Translating %s %d/%d: %s', + content_type_name, + idx, + len(items_to_translate), + item['external_id'] + ) + + try: + translation = translate_item_with_xpert( + title=item['title'], + description=item['description'], + target_language=language, + content_type=content_type_name, + external_id=item['external_id'] + ) + + # Save translation using update_or_create for atomic operation + TaxonomyTranslation.objects.update_or_create( + external_id=item['external_id'], + content_type=content_type_name, + language_code=language, + defaults={ + 'title': translation.get('title', ''), + 'description': translation.get('description', ''), + 'source_hash': item['source_hash'], + } + ) + + batch_stats['translated'] += 1 + LOGGER.info( + 'Saved translation for %s %s to %s', + content_type_name, + item['external_id'], + language + ) + + except TranslationError as error: + LOGGER.error( + 'Translation failed for %s %s: %s', + content_type_name, + item['external_id'], + str(error) + ) + batch_stats['errors'] += 1 + + except (IntegrityError, ValidationError, DatabaseError) as error: + LOGGER.error( + 'Database error saving translation for %s %s: %s', + content_type_name, + item['external_id'], + str(error), + exc_info=True + ) + batch_stats['errors'] += 1 + + return batch_stats + + def _should_translate(self, external_id, content_type, language, source_hash, force): + """ + Determine if an entity needs translation. 
+ + Args: + external_id (str): External ID of entity + content_type (str): Content type name + language (str): Target language code + source_hash (str): Current source hash + force (bool): Force retranslation + + Returns: + tuple: (should_translate: bool, is_update: bool) + """ + try: + existing = TaxonomyTranslation.objects.get( + external_id=external_id, + content_type=content_type, + language_code=language + ) + + # Translation exists - check if it needs updating + if not force and existing.source_hash == source_hash: + # English hasn't changed, skip + return (False, False) + else: + # English changed or force flag set, need to update + return (True, True) + + except TaxonomyTranslation.DoesNotExist: + # New translation needed + return (True, False) diff --git a/taxonomy/migrations/0039_taxonomytranslation.py b/taxonomy/migrations/0039_taxonomytranslation.py new file mode 100644 index 00000000..3565f09c --- /dev/null +++ b/taxonomy/migrations/0039_taxonomytranslation.py @@ -0,0 +1,36 @@ +# Generated by Django 4.2.16 on 2026-01-27 04:59 + +import django.utils.timezone +from django.db import migrations, models + +import model_utils.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('taxonomy', '0038_mariadb_uuid_conversion'), + ] + + operations = [ + migrations.CreateModel( + name='TaxonomyTranslation', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', model_utils.fields.AutoCreatedField(default=django.utils.timezone.now, editable=False, verbose_name='created')), + ('modified', model_utils.fields.AutoLastModifiedField(default=django.utils.timezone.now, editable=False, verbose_name='modified')), + ('external_id', models.CharField(db_index=True, help_text='EMSI external ID (e.g., ET1234567890, ES1234567890, IN123)', max_length=255)), + ('content_type', models.CharField(choices=[('job', 'Job'), ('skill', 'Skill'), ('industry', 'Industry')], db_index=True, 
help_text='Type of entity being translated (job, skill, or industry)', max_length=50)), + ('language_code', models.CharField(db_index=True, help_text='Target language code (e.g., es, ar, fr) following ISO 639-1', max_length=10)), + ('title', models.CharField(help_text='Translated name/title of the entity', max_length=500)), + ('description', models.TextField(blank=True, help_text='Translated description field')), + ('source_hash', models.CharField(help_text='MD5 hash of English source text (title + description). Used to detect when source content changes and retranslation is needed.', max_length=64)), + ], + options={ + 'verbose_name': 'Taxonomy Translation', + 'verbose_name_plural': 'Taxonomy Translations', + 'indexes': [models.Index(fields=['language_code', 'content_type'], name='taxonomy_ta_languag_ce278d_idx'), models.Index(fields=['external_id', 'language_code'], name='taxonomy_ta_externa_b71780_idx')], + 'unique_together': {('external_id', 'content_type', 'language_code')}, + }, + ), + ] diff --git a/taxonomy/models.py b/taxonomy/models.py index 2d0da364..94478147 100644 --- a/taxonomy/models.py +++ b/taxonomy/models.py @@ -4,6 +4,7 @@ """ from __future__ import unicode_literals +import hashlib import logging import uuid @@ -1117,6 +1118,139 @@ class Meta: verbose_name_plural = 'Industries' +class TaxonomyTranslation(TimeStampedModel): + """ + Store translations for taxonomy entities (jobs, skills, industries). + + This model follows the enterprise-catalog ContentTranslation pattern, + using source_hash to track when English source content changes and + retranslation is needed. + + .. 
no_pii: + """ + + CONTENT_TYPE_CHOICES = [ + ('job', 'Job'), + ('skill', 'Skill'), + ('industry', 'Industry'), + ] + + external_id = models.CharField( + max_length=255, + db_index=True, + help_text=_( + 'EMSI external ID (e.g., ET1234567890, ES1234567890, IN123)' + ) + ) + + content_type = models.CharField( + max_length=50, + choices=CONTENT_TYPE_CHOICES, + db_index=True, + help_text=_( + 'Type of entity being translated (job, skill, or industry)' + ) + ) + + language_code = models.CharField( + max_length=10, + db_index=True, + help_text=_( + 'Target language code (e.g., es, ar, fr) following ISO 639-1' + ) + ) + + # Translated fields + title = models.CharField( + max_length=500, + help_text=_( + 'Translated name/title of the entity' + ) + ) + + description = models.TextField( + blank=True, + help_text=_( + 'Translated description field' + ) + ) + + # Change detection + source_hash = models.CharField( + max_length=64, + help_text=_( + 'MD5 hash of English source text (title + description). ' + 'Used to detect when source content changes and retranslation is needed.' + ) + ) + + class Meta: + """ + Meta options for TaxonomyTranslation. + """ + + app_label = 'taxonomy' + verbose_name = _('Taxonomy Translation') + verbose_name_plural = _('Taxonomy Translations') + unique_together = [('external_id', 'content_type', 'language_code')] + indexes = [ + models.Index(fields=['language_code', 'content_type']), + models.Index(fields=['external_id', 'language_code']), + ] + + def __str__(self): + """ + Return a human-readable string representation. + """ + return '{content_type}:{external_id}:{language}:{title}'.format( + content_type=self.content_type, + external_id=self.external_id, + language=self.language_code, + title=self.title[:50], # Truncate for readability + ) + + def __repr__(self): + """ + Create a unique string representation of the object. 
+ """ + return ''.format( + self.id, + self.content_type, + self.external_id, + self.language_code, + ) + + @staticmethod + def calculate_source_hash(title, description=''): + """ + Calculate MD5 hash of English source text. + + This hash is used to detect when the source content has changed, + which triggers retranslation. The hash combines title and description + with a delimiter to avoid hash collisions. + + Args: + title (str): The title/name of the entity + description (str): The description of the entity (optional) + + Returns: + str: MD5 hash (32 hex characters) + + Example: + >>> TaxonomyTranslation.calculate_source_hash('Software Engineer', 'Develops apps') + 'a1b2c3d4e5f6...' + """ + # Combine title and description with delimiter + # Use || to separate fields (unlikely to appear in actual content) + source_text = '{title}||{description}'.format( + title=title or '', + description=description or '' + ) + + # Calculate MD5 hash + return hashlib.md5(source_text.encode('utf-8')).hexdigest() + + class B2CJobAllowList(models.Model): """ Model for storing admin configuration for B2C Job Allowlist entries. diff --git a/taxonomy/tests/test_populate_taxonomy_translations.py b/taxonomy/tests/test_populate_taxonomy_translations.py new file mode 100644 index 00000000..82200c5f --- /dev/null +++ b/taxonomy/tests/test_populate_taxonomy_translations.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- +""" +Tests for populate_taxonomy_translations management command. 
+""" +import unittest +from unittest.mock import patch + +import ddt +import pytest + +from django.core.management import call_command +from django.core.management.base import CommandError +from django.db import DatabaseError + +from taxonomy.models import Industry, Job, Skill, TaxonomyTranslation +from taxonomy.translation_utils import TranslationError + + +@pytest.mark.django_db +@ddt.ddt +class TestPopulateTaxonomyTranslationsCommand(unittest.TestCase): + """Test the populate_taxonomy_translations management command.""" + + @ddt.data('job', 'skill', 'industry') + def test_translate_content_type_success(self, content_type): + """Test successful translation for each content type.""" + # Create test data + if content_type == 'job': + Job.objects.create(external_id='ET123', name='Software Engineer', description='Develops apps') + elif content_type == 'skill': + Skill.objects.create(external_id='ES123', name='Python', description='Programming language') + else: + Industry.objects.create(code=54, name='Technology') + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + external_id = 'ET123' if content_type == 'job' else ('ES123' if content_type == 'skill' else '54') + mock_chat.return_value = f'{{"title": "Translated", "description": "Desc"}}' + + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', content_type) + + # Verify translation created + translation = TaxonomyTranslation.objects.get( + external_id=external_id, + content_type=content_type, + language_code='es' + ) + assert translation.title == 'Translated' + assert translation.description == 'Desc' + + @pytest.mark.django_db + def test_skip_unchanged_translations(self): + """Test that unchanged translations are skipped.""" + job = Job.objects.create(external_id='ET123', name='Engineer', description='Develops') + source_hash = TaxonomyTranslation.calculate_source_hash(job.name, job.description) + + # Create existing translation + 
TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='Ingeniero', + description='Desarrolla', + source_hash=source_hash + ) + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # API should not be called + mock_chat.assert_not_called() + + @pytest.mark.django_db + def test_update_stale_translations(self): + """Test that stale translations are updated.""" + job = Job.objects.create(external_id='ET123', name='Engineer Updated', description='New desc') + + # Create stale translation with old hash + TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='Old Translation', + description='Old desc', + source_hash='old_hash_123' + ) + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + mock_chat.return_value = '{"title": "New Translation", "description": "New"}' + + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # Translation should be updated + translation = TaxonomyTranslation.objects.get(external_id='ET123', content_type='job', language_code='es') + assert translation.title == 'New Translation' + assert translation.description == 'New' + + @pytest.mark.django_db + def test_force_retranslation(self): + """Test --force flag retranslates even unchanged items.""" + job = Job.objects.create(external_id='ET123', name='Engineer', description='Develops') + source_hash = TaxonomyTranslation.calculate_source_hash(job.name, job.description) + + TaxonomyTranslation.objects.create( + external_id='ET123', + content_type='job', + language_code='es', + title='Ingeniero', + description='Desarrolla', + source_hash=source_hash + ) + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + mock_chat.return_value = '{"title": "Forced Translation", "description": 
"Forced"}' + + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job', '--force') + + # Translation should be updated despite same hash + translation = TaxonomyTranslation.objects.get(external_id='ET123', content_type='job', language_code='es') + assert translation.title == 'Forced Translation' + + @pytest.mark.django_db + def test_invalid_language_code(self): + """Test command rejects invalid language codes.""" + with pytest.raises(CommandError): + call_command('populate_taxonomy_translations', '--language', 'xyz') + + @pytest.mark.django_db + def test_no_items_to_translate(self): + """Test when there are no items to translate.""" + # Don't create any jobs - empty database + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # API should not be called + mock_chat.assert_not_called() + # Should complete without errors + assert TaxonomyTranslation.objects.count() == 0 + + @pytest.mark.django_db + def test_translation_error_handling(self): + """ + Test that command handles TranslationError gracefully. + + When translate_item_with_xpert raises TranslationError (e.g., due to missing title), + the command should catch it, log the error, increment error counter, and continue + without saving the incomplete translation. 
+ """ + # Create test job + Job.objects.create(external_id='ET123', name='Software Engineer', description='Develops apps') + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + # Simulate incomplete translation (missing title) which triggers TranslationError + mock_chat.return_value = '{"title": "", "description": "Desc"}' + + # Command should complete and handle the error gracefully + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # No translation should be created because TranslationError was caught + assert TaxonomyTranslation.objects.filter(external_id='ET123').count() == 0 + + @pytest.mark.django_db + def test_multiple_translation_errors(self): + """ + Test that command handles multiple TranslationErrors gracefully. + + When multiple items fail to translate (each raising TranslationError), + the command should catch each error, log it, and continue processing + remaining items without crashing. + """ + # Create multiple jobs + Job.objects.create(external_id='ET001', name='Job 1', description='Desc 1') + Job.objects.create(external_id='ET002', name='Job 2', description='Desc 2') + Job.objects.create(external_id='ET003', name='Job 3', description='Desc 3') + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + # All translations fail (empty titles trigger TranslationError) + mock_chat.return_value = '{"title": "", "description": "Desc"}' + + # Command should complete despite all errors being caught + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # No translations should be created since all raised TranslationError + assert TaxonomyTranslation.objects.count() == 0 + + @pytest.mark.django_db + def test_database_error_handling(self): + """ + Test that command handles database errors gracefully. 
+ + When a DatabaseError occurs during save (after successful translation), + the command should catch it, log the error, increment error counter, + and continue without crashing. + """ + # Create test job + Job.objects.create(external_id='ET123', name='Software Engineer', description='Develops apps') + + with patch('taxonomy.translation_utils.chat_completion') as mock_chat: + mock_chat.return_value = '{"title": "Ingeniero", "description": "Desarrolla"}' + + # Mock update_or_create to raise DatabaseError + with patch('taxonomy.models.TaxonomyTranslation.objects.update_or_create') as mock_save: + mock_save.side_effect = DatabaseError('Database connection failed') + + # Command should complete and handle the database error gracefully + call_command('populate_taxonomy_translations', '--language', 'es', '--content-type', 'job') + + # No translation should be created due to DatabaseError being caught + assert TaxonomyTranslation.objects.filter(external_id='ET123').count() == 0 diff --git a/taxonomy/tests/test_translation_utils.py b/taxonomy/tests/test_translation_utils.py new file mode 100644 index 00000000..34a4eda7 --- /dev/null +++ b/taxonomy/tests/test_translation_utils.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- +""" +Tests for translation utilities. 
+""" +import unittest +from unittest.mock import patch + +import ddt +import pytest + +from taxonomy.translation_utils import ( + TranslationError, + _build_translation_prompt, + _parse_translation_response, + get_supported_languages, + translate_item_with_xpert, + validate_language_code, +) + + +@ddt.ddt +class TestTranslationUtils(unittest.TestCase): + """Test translation utility functions.""" + + @ddt.data( + ('es', True), + ('en', False), + ('ar', False), + ('xyz', False), + ) + @ddt.unpack + def test_validate_language_code(self, language_code, expected): + """Test language code validation - only Spanish supported.""" + assert validate_language_code(language_code) == expected + + def test_get_supported_languages(self): + """Test getting supported languages returns only Spanish.""" + languages = get_supported_languages() + assert languages == ['es'] + + def test_build_translation_prompt(self): + """Test building translation prompt for a single item.""" + prompt = _build_translation_prompt( + title='Software Engineer', + description='Develops software', + content_type='job', + target_language='es' + ) + + assert 'Software Engineer' in prompt + assert 'Spanish' in prompt + assert 'Develops software' in prompt + + @ddt.data( + # Valid complete response + ( + '{"title": "Ingeniero", "description": "Desarrolla"}', + {'title': 'Ingeniero', 'description': 'Desarrolla'} + ), + # Response with only title (description defaults to empty) + ( + '{"title": "Ingeniero"}', + {'title': 'Ingeniero', 'description': ''} + ), + # Response with empty description + ( + '{"title": "Ingeniero", "description": ""}', + {'title': 'Ingeniero', 'description': ''} + ), + ) + @ddt.unpack + def test_parse_translation_response_success(self, response, expected): + """Test parsing valid single-item translation responses.""" + result = _parse_translation_response(response) + + assert result['title'] == expected['title'] + assert result['description'] == expected['description'] + + def 
test_parse_translation_response_missing_description(self): + """Test parsing response with missing description field uses empty string.""" + response = '{"title": "Ingeniero"}' + + result = _parse_translation_response(response) + + assert result['title'] == 'Ingeniero' + assert result['description'] == '' # Missing - fallback + + @ddt.data( + 'This is not JSON', + '{invalid json}', + 'null', + '[]', # Array instead of object + '123', # Number instead of object + ) + def test_parse_translation_response_invalid(self, response): + """Test parsing invalid responses returns empty dict.""" + result = _parse_translation_response(response) + + # Function handles errors gracefully by returning empty dict + assert result == {'title': '', 'description': ''} + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_success(self, mock_chat): + """Test successful single-item translation.""" + mock_chat.return_value = '{"title": "Ingeniero", "description": "Desarrolla"}' + + result = translate_item_with_xpert( + title='Engineer', + description='Develops', + target_language='es', + content_type='job', + external_id='ET123' + ) + + assert result['title'] == 'Ingeniero' + assert result['description'] == 'Desarrolla' + assert mock_chat.call_count == 1 + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_api_error(self, mock_chat): + """Test single-item translation raises TranslationError on API errors.""" + mock_chat.side_effect = Exception('API Error') + + with pytest.raises(TranslationError) as exc_info: + translate_item_with_xpert( + title='Engineer', + description='', + target_language='es', + content_type='job', + external_id='ET123' + ) + + assert 'Failed to translate job ET123' in str(exc_info.value) + assert 'API Error' in str(exc_info.value) + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_missing_title_translation(self, mock_chat): + """Test that TranslationError is raised when title is 
provided but translation is missing.""" + # Mock returns empty title when we provided a non-empty title + mock_chat.return_value = '{"title": "", "description": "Desarrolla"}' + + with pytest.raises(TranslationError) as exc_info: + translate_item_with_xpert( + title='Engineer', # Non-empty title provided + description='Develops', + target_language='es', + content_type='job', + external_id='ET123' + ) + + assert 'Translation missing title' in str(exc_info.value) + assert 'ET123' in str(exc_info.value) + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_missing_description_translation(self, mock_chat): + """Test that TranslationError is raised when description is provided but translation is missing.""" + # Mock returns empty description when we provided a non-empty description + mock_chat.return_value = '{"title": "Ingeniero", "description": ""}' + + with pytest.raises(TranslationError) as exc_info: + translate_item_with_xpert( + title='Engineer', + description='Develops software applications', # Non-empty description provided + target_language='es', + content_type='job', + external_id='ET123' + ) + + assert 'Translation missing description' in str(exc_info.value) + assert 'ET123' in str(exc_info.value) + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_empty_inputs_no_error(self, mock_chat): + """Test that no error is raised when inputs are empty and translations are empty.""" + # Empty inputs should not trigger validation error + mock_chat.return_value = '{"title": "", "description": ""}' + + result = translate_item_with_xpert( + title='', # Empty title + description='', # Empty description + target_language='es', + content_type='job', + external_id='ET123' + ) + + assert result['title'] == '' + assert result['description'] == '' + # No exception should be raised + + @patch('taxonomy.translation_utils.chat_completion') + def test_translate_item_only_title_provided(self, mock_chat): + """Test translation 
def translate_item_with_xpert(title, description, target_language, content_type, external_id):
    """
    Translate one taxonomy item's title and description via Xpert AI.

    A prompt is built from the English ``title`` and ``description``, sent to
    ``chat_completion`` and the reply is parsed into a dict. The parsed result
    is then checked for completeness: every non-empty input field must come
    back with a non-empty translation.

    Args:
        title (str): English title/name to translate
        description (str): English description to translate
        target_language (str): Target language code (e.g., 'es')
        content_type (str): Type of content ('job', 'skill', or 'industry')
        external_id (str): External ID, used only in log and error messages

    Returns:
        dict: Parsed translation with the keys:
            - title (str): Translated title
            - description (str): Translated description

    Raises:
        TranslationError: If a provided title or description comes back empty,
            or if any other exception is raised while building the prompt,
            calling the API or parsing the reply (the original exception is
            chained as the cause).

    Example:
        >>> result = translate_item_with_xpert(
        ...     title='Software Engineer',
        ...     description='Develops software applications',
        ...     target_language='es',
        ...     content_type='job',
        ...     external_id='ET123'
        ... )
        >>> result['title']
        'Ingeniero de Software'
    """
    LOGGER.debug(
        'Translating %s %s to %s using Xpert AI',
        content_type,
        external_id,
        target_language
    )

    system_message = "You are a professional translator specializing in career and education content."

    try:
        # One API round trip per item: build the prompt, send it, parse the reply.
        response = chat_completion(
            prompt=_build_translation_prompt(
                title=title,
                description=description,
                content_type=content_type,
                target_language=target_language
            ),
            system_message=system_message
        )
        translation = _parse_translation_response(response)

        # A field that was supplied by the caller must not come back empty.
        if title and not translation['title']:
            raise TranslationError(
                f"Translation missing title for {content_type} {external_id}. "
                f"Input title: '{title}'"
            )
        if description and not translation['description']:
            raise TranslationError(
                f"Translation missing description for {content_type} {external_id}. "
                f"Input description length: {len(description)} chars"
            )

        LOGGER.debug(
            'Successfully translated %s %s to %s',
            content_type,
            external_id,
            target_language
        )
    except Exception as error:  # pylint: disable=broad-exception-caught
        # Completeness failures raised above pass through untouched; only
        # unexpected errors are logged and wrapped.
        if isinstance(error, TranslationError):
            raise
        LOGGER.error(
            'Error translating %s %s: %s',
            content_type,
            external_id,
            str(error),
            exc_info=True
        )
        # Wrap so callers only have to handle a single exception type.
        raise TranslationError(
            f"Failed to translate {content_type} {external_id}: {error!s}"
        ) from error

    return translation
def _parse_translation_response(response):
    """
    Parse the Xpert AI reply for a single translated item.

    The reply is expected to be a JSON object with ``title`` and ``description``
    fields. Each field is coerced to ``str`` and stripped of surrounding
    whitespace. A field that is absent or JSON ``null`` becomes an empty string,
    so a missing translation is reported as empty instead of as the literal
    text ``'None'``.

    Args:
        response (str): Response from Xpert AI API containing a JSON object

    Returns:
        dict: Translated content with:
            - title (str): Translated title (or empty string on error)
            - description (str): Translated description (or empty string on error)
        Both values are empty strings when ``response`` is not a str/bytes
        value, is not valid JSON, or does not decode to a JSON object.

    Example:
        >>> response = '{"title": "Ingeniero de Software", "description": "Desarrolla aplicaciones"}'
        >>> result = _parse_translation_response(response)
        >>> result['title']
        'Ingeniero de Software'
    """
    try:
        translated = json.loads(response)
    except (TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (undecodable bytes); TypeError covers a reply that
        # is not str/bytes at all, e.g. None.
        LOGGER.error('Failed to parse translation response as JSON: %s', str(e))
        # str() keeps the preview safe for non-sliceable replies such as None.
        LOGGER.debug('Response content: %s', str(response)[:200])
        return {'title': '', 'description': ''}

    # Valid JSON that is not an object ([], 123, null, ...) carries no fields.
    if not isinstance(translated, dict):
        LOGGER.error('Expected JSON object, got %s', type(translated).__name__)
        return {'title': '', 'description': ''}

    def _clean(value):
        # JSON null decodes to None; str(None) would yield the text 'None'.
        return '' if value is None else str(value).strip()

    return {
        'title': _clean(translated.get('title')),
        'description': _clean(translated.get('description')),
    }
def validate_language_code(language_code):
    """
    Check whether translation into the given language is supported.

    Args:
        language_code (str): Language code to validate

    Returns:
        bool: True when ``language_code`` is one of the codes returned by
            ``get_supported_languages()``, False otherwise

    Example:
        >>> validate_language_code('es')
        True
        >>> validate_language_code('xyz')
        False
    """
    supported_codes = get_supported_languages()
    return language_code in supported_codes
@mark.django_db
class TestTaxonomyTranslation(TestCase):
    """
    Validate the textual representations of ``TaxonomyTranslation`` objects.
    """

    def test_string_representation(self):
        """
        Check ``__str__`` and ``__repr__`` of a factory-built translation.
        """
        attributes = {
            'external_id': 'ET',
            'content_type': 'job',
            'language_code': 'es',
            'title': 'In',
            'description': 'De',
        }
        translation = factories.TaxonomyTranslationFactory(**attributes)

        # NOTE(review): the repr template below is an empty string, so the
        # expected repr is '' whatever the id is -- confirm this against
        # TaxonomyTranslation.__repr__; the template text may have been lost.
        expected_repr = ''.format(
            translation.id
        )
        expected_str = 'job:ET:es:In'

        assert expected_str == str(translation)
        assert expected_repr == repr(translation)