Skip to content

Commit d2befee

Browse files
committed
🚸(backend) use unaccented full name for user search
We have the user full name through OIDC in the database, but the search only used the email field. This change allows searching for a user by their first and/or last name (fix #929). Given that user names are more likely than emails to include diacritics, it unaccents both the query and the database entry for search (fix #1091). It also unaccents emails so that internationalized domain names are handled whether or not the accent is included in the search. An unaccented GIN index is added on the user full_name and email fields. A manual migration is used because a wrapper around unaccent is necessary to make it IMMUTABLE (cf. https://stackoverflow.com/questions/9063402/ )
1 parent 175d80d commit d2befee

File tree

4 files changed

+178
-8
lines changed

4 files changed

+178
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ and this project adheres to
2323
- ♻️(frontend) preserve @ character when esc is pressed after typing it #1512
2424
- ♻️(frontend) make summary button fixed to remain visible during scroll #1581
2525
- ♻️(frontend) pdf embed use full width #1526
26+
- 🚸(backend) use unaccented full name for user search #1637
2627

2728
### Fixed
2829

src/backend/core/api/viewsets.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""API endpoints"""
2+
23
# pylint: disable=too-many-lines
34

45
import base64
@@ -18,7 +19,7 @@
1819
from django.db import connection, transaction
1920
from django.db import models as db
2021
from django.db.models.expressions import RawSQL
21-
from django.db.models.functions import Left, Length
22+
from django.db.models.functions import Greatest, Left, Length
2223
from django.http import Http404, StreamingHttpResponse
2324
from django.urls import reverse
2425
from django.utils import timezone
@@ -37,6 +38,7 @@
3738
from rest_framework.permissions import AllowAny
3839

3940
from core import authentication, choices, enums, models
41+
from core.api.filters import remove_accents
4042
from core.services.ai_services import AIService
4143
from core.services.collaboration_services import CollaborationService
4244
from core.services.converter_services import (
@@ -188,13 +190,15 @@ def get_queryset(self):
188190
queryset = queryset.exclude(documentaccess__document_id=document_id)
189191

190192
filter_data = filterset.form.cleaned_data
191-
query = filter_data["q"]
193+
query = remove_accents(filter_data["q"])
192194

193195
# For emails, match emails by Levenshtein distance to tolerate typing errors
194196
if "@" in query:
195197
return (
196198
queryset.annotate(
197-
distance=RawSQL("levenshtein(email::text, %s::text)", (query,))
199+
distance=RawSQL(
200+
"levenshtein(unaccent(email::text), %s::text)", (query,)
201+
)
198202
)
199203
.filter(distance__lte=3)
200204
.order_by("distance", "email")[: settings.API_USERS_LIST_LIMIT]
@@ -203,11 +207,15 @@ def get_queryset(self):
203207
# Use trigram similarity for non-email-like queries
204208
# For performance reasons we filter first by similarity, which relies on an
205209
# index, then only calculate precise similarity scores for sorting purposes
210+
206211
return (
207-
queryset.filter(email__trigram_word_similar=query)
208-
.annotate(similarity=TrigramSimilarity("email", query))
212+
queryset.annotate(
213+
sim_email=TrigramSimilarity("email", query),
214+
sim_name=TrigramSimilarity("full_name", query),
215+
)
216+
.annotate(similarity=Greatest("sim_email", "sim_name"))
209217
.filter(similarity__gt=0.2)
210-
.order_by("-similarity", "email")[: settings.API_USERS_LIST_LIMIT]
218+
.order_by("-similarity")[: settings.API_USERS_LIST_LIMIT]
211219
)
212220

213221
@drf.decorators.action(
@@ -1970,8 +1978,7 @@ def get_queryset(self):
19701978
)
19711979
# Abilities are computed based on logged-in user's role and
19721980
# the user role on each document access
1973-
.annotate(user_roles=db.Subquery(user_roles_query))
1974-
.distinct()
1981+
.annotate(user_roles=db.Subquery(user_roles_query)).distinct()
19751982
)
19761983
return queryset
19771984

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Generated by Django 5.2.8 on 2025-11-20 09:56
2+
3+
from django.db import migrations
4+
5+
6+
class Migration(migrations.Migration):
    """Add immutable unaccent helpers and unaccented trigram indexes on users.

    Forward:
      * ``public.immutable_unaccent(regdictionary, text)`` binds to the
        ``unaccent_dict`` entry point of ``$libdir/unaccent`` and is declared
        IMMUTABLE so it can be used in index expressions.
      * ``public.f_unaccent(text)`` is a one-argument SQL wrapper that calls
        the function above with the ``public.unaccent`` dictionary.
      * Two GIN trigram indexes are created on ``impress_user``, over
        ``f_unaccent(email)`` and ``f_unaccent(full_name)``.

    Reverse:
      Drops the indexes first (they depend on ``f_unaccent``), then
      ``f_unaccent`` and finally ``immutable_unaccent``, so that rolling back
      leaves no object created by this migration behind.

    NOTE(review): this assumes the ``unaccent`` and ``pg_trgm`` extensions are
    already installed in schema ``public`` by an earlier migration, and that
    the server accepts the SQL-standard ``RETURN`` function body — confirm
    against the supported PostgreSQL versions.
    """

    dependencies = [
        ("core", "0026_comments"),
    ]

    operations = [
        migrations.RunSQL(
            sql="""
            CREATE OR REPLACE FUNCTION public.immutable_unaccent(regdictionary, text)
                RETURNS text
                LANGUAGE c IMMUTABLE PARALLEL SAFE STRICT AS
                '$libdir/unaccent', 'unaccent_dict';

            CREATE OR REPLACE FUNCTION public.f_unaccent(text)
                RETURNS text
                LANGUAGE sql IMMUTABLE PARALLEL SAFE STRICT
                RETURN public.immutable_unaccent(regdictionary 'public.unaccent', $1);

            CREATE INDEX IF NOT EXISTS user_email_unaccent_trgm_idx
                ON impress_user
                USING gin (f_unaccent(email) gin_trgm_ops);

            CREATE INDEX IF NOT EXISTS user_full_name_unaccent_trgm_idx
                ON impress_user
                USING gin (f_unaccent(full_name) gin_trgm_ops);
            """,
            # Order matters: indexes depend on f_unaccent, which in turn
            # depends on immutable_unaccent.
            reverse_sql="""
            DROP INDEX IF EXISTS user_email_unaccent_trgm_idx;
            DROP INDEX IF EXISTS user_full_name_unaccent_trgm_idx;
            DROP FUNCTION IF EXISTS public.f_unaccent(text);
            DROP FUNCTION IF EXISTS public.immutable_unaccent(regdictionary, text);
            """,
        ),
    ]

src/backend/core/tests/test_api_users.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,131 @@ def test_api_users_list_query_email():
7676
assert user_ids == []
7777

7878

79+
def test_api_users_list_query_email_with_internationalized_domain_names():
    """
    Authenticated users should be able to list users and filter by email.
    It should work even if the email address contains an internationalized domain name,
    whether or not the accent is typed in the query.
    """
    user = factories.UserFactory()

    client = APIClient()
    client.force_login(user)

    # jean is stored with an accented domain, marie with the unaccented one,
    # so both directions of accent folding are exercised.
    jean = factories.UserFactory(email="jean.martin@éducation.fr")
    marie = factories.UserFactory(email="marie.durand@education.fr")
    kurokawa = factories.UserFactory(email="contact@黒川.日本")

    # Unaccented query against an accented stored address
    response = client.get("/api/v1.0/users/?q=jean.martin@education.fr")
    assert response.status_code == 200
    user_ids = [user["id"] for user in response.json()]
    assert user_ids == [str(jean.id)]

    # Accented query against an accented stored address
    response = client.get("/api/v1.0/users/?q=jean.martin@éducation.fr")
    assert response.status_code == 200
    user_ids = [user["id"] for user in response.json()]
    assert user_ids == [str(jean.id)]

    # Unaccented query against an unaccented stored address
    response = client.get("/api/v1.0/users/?q=marie.durand@education.fr")
    assert response.status_code == 200
    user_ids = [user["id"] for user in response.json()]
    assert user_ids == [str(marie.id)]

    # Accented query against an unaccented stored address
    response = client.get("/api/v1.0/users/?q=marie.durand@éducation.fr")
    assert response.status_code == 200
    user_ids = [user["id"] for user in response.json()]
    assert user_ids == [str(marie.id)]

    # Non-latin domain names must still be searchable
    response = client.get("/api/v1.0/users/?q=contact@黒川.日本")
    assert response.status_code == 200
    user_ids = [user["id"] for user in response.json()]
    assert user_ids == [str(kurokawa.id)]
117+
118+
119+
def test_api_users_list_query_full_name():
    """
    The user list endpoint should let an authenticated user search by full name.
    First name, last name and any letter casing of them are expected to match;
    a query whose trigram similarity is not greater than 0.2 should return nothing.
    """
    requester = factories.UserFactory()

    api = APIClient()
    api.force_login(requester)

    # NOTE(review): this address looks like an e-mail redaction placeholder
    # rather than the fixture's real value — confirm against the repository.
    dave = factories.UserFactory(email="[email protected]", full_name="David Bowman")
    expected_ids = [str(dave.id)]

    # Each of these terms is expected to find dave, whatever the casing
    for term in ("David", "Bowman", "bowman", "BOWMAN", "BoWmAn"):
        reply = api.get(f"/api/v1.0/users/?q={term}")
        assert reply.status_code == 200
        assert [entry["id"] for entry in reply.json()] == expected_ids

    # A term that is not close enough to the name must not match
    reply = api.get("/api/v1.0/users/?q=Bovin")
    assert reply.status_code == 200
    assert [entry["id"] for entry in reply.json()] == []
162+
163+
164+
def test_api_users_list_query_accented_full_name():
    """
    The user list endpoint should let an authenticated user search an accented
    full name, with or without typing the accents in the query.
    A query whose trigram similarity is not greater than 0.2 should return nothing.
    """
    requester = factories.UserFactory()

    api = APIClient()
    api.force_login(requester)

    # NOTE(review): this address looks like an e-mail redaction placeholder
    # rather than the fixture's real value — confirm against the repository.
    fred = factories.UserFactory(
        email="[email protected]", full_name="Frédérique Lefèvre"
    )
    expected_ids = [str(fred.id)]

    # Accented and unaccented spellings of each name part should both match
    for term in ("Frédérique", "Frederique", "Lefèvre", "Lefevre"):
        reply = api.get(f"/api/v1.0/users/?q={term}")
        assert reply.status_code == 200
        assert [entry["id"] for entry in reply.json()] == expected_ids

    # A different accented name must not match
    reply = api.get("/api/v1.0/users/?q=François Lorfebvre")
    assert reply.status_code == 200
    names = [entry["full_name"] for entry in reply.json()]
    assert names == []
202+
203+
79204
def test_api_users_list_limit(settings):
80205
"""
81206
Authenticated users should be able to list users and the number of results

0 commit comments

Comments
 (0)