Skip to content
91 changes: 69 additions & 22 deletions src/openedx_tagging/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from __future__ import annotations

import logging
import operator
import re
from functools import reduce
from typing import List, Self

from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import F, Value
from django.db.models.functions import Concat, Length, Replace, Substr
from django.db.models import Count, F, IntegerField, Q, Subquery, Value
from django.db.models.functions import Coalesce, Concat, Length, Replace, Substr
from django.utils.functional import cached_property
from django.utils.module_loading import import_string
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -539,16 +541,7 @@ def _get_filtered_tags_one_level(
qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
qs = qs.order_by("value")
if include_counts:
# We need to include the count of how many times this tag is used to tag objects.
# You'd think we could just use:
# qs = qs.annotate(usage_count=models.Count("objecttag__pk"))
# but that adds another join which starts creating a cross product and the children and usage_count become
# intertwined and multiplied with each other. So we use a subquery.
obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F('id'), function='Count')
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count')))
qs = self._add_counts_query(qs)
return qs # type: ignore[return-value]

def _get_filtered_tags_deep(
Expand Down Expand Up @@ -644,18 +637,72 @@ def _get_filtered_tags_deep(
# ordering by it gives the tree sort order that we want.
qs = qs.order_by("lineage")
if include_counts:
# Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
obj_tags = (
ObjectTag.objects.filter(tag_id=models.OuterRef("pk"))
.order_by()
.annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F("id"), function="Count")
)
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count")))
qs = self._add_counts_query(qs)

return qs # type: ignore[return-value]

def _add_counts_query(self, qs: models.QuerySet) -> models.QuerySet:
"""
Adds a subquery to the passed-in queryset that returns the usage_count
for a given tag, or the appropriate count with deduplication per Object
for the parents of a used child tag.

The ``qs`` argument is the QuerySet to annotate with usage counts, and
the returned queryset is annotated with those usage counts.
"""
# Adds a subquery to the passed-in queryset that returns the number
# of times a tag has been used.
#
# Note: The count is not a simple count, we need to do a 'roll up'
# where we count the number of times a tag is directly used and applied,
# but then that also needs to add a "1" count to the lineage tags
# (parent, grandparent, etc.), but de-duplicate counts for any children
# so that if we have "2" child tags, it only counts towards "1" for the
# parent.
# This query gets the raw counts for each tag usage, gets the distinct
# usages (so de-duplicates counts) by actual application to an "Object"
# (library, course, course module, course section, etc.), which creates
# a count per tag, annotated to that particular tag from the passed-in
# queryset.

# Since Depth may be variable based on the taxonomy, we dynamically build
# a list of lineage paths to be used in the query, so we're not hard coding to
# a certain number of levels. This will query for the max depth, then build
# an array containing something like:
# ['tag_id', 'tag__parent_id', 'tag__parent__parent_id', 'tag__parent__parent__parent_id', ...]
max_depth = Tag.objects.aggregate(models.Max("depth", default=0))["depth__max"]
lineage_paths = [f"tag{'__parent' * i}_id" for i in range(max_depth + 1)]

# Combine the above-built lineage with a Q query against the OuterRef("pk"),
lineage_query_list = [Q(**{path: models.OuterRef("pk")}) for path in lineage_paths]

usage_count_qs = ObjectTag.objects.filter(
# Combine the logic built above with an or operator to flesh out a
# lineage query of the form:
# ```
# Q(tag_id=OuterRef('pk')) |
# Q(tag__parent_id=OuterRef('pk')) |
# Q(tag__parent__parent_id=OuterRef('pk')) |
# ...
# ```
# Previously the above was hard coded and needed to be changed with every
# change in TAXONOMY_MAX_DEPTH, now it is built dynamically

reduce(operator.or_, lineage_query_list)
).values('object_id').distinct().annotate(
intermediate_grouping=Value(1, output_field=IntegerField())
).values('intermediate_grouping').annotate(
total_usage=Count('object_id', distinct=True)
).values('total_usage')

qs = qs.annotate(
usage_count=Coalesce(
Subquery(usage_count_qs, output_field=IntegerField()),
0 # Coalesce ensures we return 0 instead of None if there are no usages
)
)
return qs

def add_tag(
self,
tag_value: str,
Expand Down
22 changes: 11 additions & 11 deletions tests/openedx_tagging/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,43 +755,43 @@ def get_object_tags():
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "cha" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "cha" but a child does
" Archaebacteria (used: 1, children: 0)",
]),
("ar", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ar" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ar" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1 + 2)",
" Animalia (used: 1, children: 2)", # does not contain "ar" but a child does
"Eukaryota (used: 6, children: 1 + 2)",
" Animalia (used: 4, children: 2)", # does not contain "ar" but a child does
" Arthropoda (used: 1, children: 0)",
" Cnidaria (used: 0, children: 0)",
]),
("aE", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ae" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ae" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1)", # does not contain "ae" but a child does
"Eukaryota (used: 6, children: 1)", # does not contain "ae" but a child does
" Plantae (used: 1, children: 0)",
]),
("a", [
"Archaea (used: 1, children: 3)",
" DPANN (used: 0, children: 0)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 2)",
"Bacteria (used: 1, children: 2)",
" Archaebacteria (used: 1, children: 0)",
" Eubacteria (used: 0, children: 0)",
"Eukaryota (used: 0, children: 4 + 8)",
" Animalia (used: 1, children: 7 + 1)",
"Eukaryota (used: 6, children: 4 + 8)",
" Animalia (used: 4, children: 7 + 1)",
" Arthropoda (used: 1, children: 0)",
" Chordata (used: 0, children: 1)",
" Chordata (used: 0, children: 1)", # <<< Chordata has a matching child but we only support searching
" Mammalia (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)", # 3 levels deep at once for now.
" Ctenophora (used: 0, children: 0)",
" Gastrotrich (used: 1, children: 0)",
" Placozoa (used: 1, children: 0)",
Expand Down
Loading