Skip to content

Commit 710bbf5

Browse files
sampaccoudlunika
authored and committed
✨(backend) add util to extract text from Ydoc content
Document content is stored in the Ydoc format. We need a util to extract it as XML or plain text.
1 parent 747ca70 commit 710bbf5

File tree

7 files changed

+123
-1
lines changed

7 files changed

+123
-1
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ and this project adheres to
132132

133133
## Added
134134

135+
- ⚗️(backend) add util to extract text from base64 yjs document
135136
- ✨(backend) add soft delete and restore API endpoints to documents #516
136137
- ✨(backend) allow organizing documents in a tree structure #516
137138
- ✨(backend) add "excerpt" field to document list serializer #516

Dockerfile

+7
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ FROM base AS back-builder
1515

1616
WORKDIR /builder
1717

18+
# Install Rust and Cargo using Alpine's package manager
19+
RUN apk add --no-cache \
20+
build-base \
21+
libffi-dev \
22+
rust \
23+
cargo
24+
1825
# Copy required python dependencies
1926
COPY ./src/backend /builder
2027

src/backend/core/factories.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,22 @@
1313

1414
fake = Faker()
1515

16+
YDOC_HELLO_WORLD_BASE64 = (
17+
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
18+
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
19+
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
20+
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
21+
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
22+
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
23+
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
24+
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
25+
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
26+
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
27+
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
28+
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
29+
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
30+
)
31+
1632

1733
class UserFactory(factory.django.DjangoModelFactory):
1834
"""A factory to create random users for testing purposes."""
@@ -75,7 +91,7 @@ class Meta:
7591

7692
title = factory.Sequence(lambda n: f"document{n}")
7793
excerpt = factory.Sequence(lambda n: f"excerpt{n}")
78-
content = factory.Sequence(lambda n: f"content{n}")
94+
content = YDOC_HELLO_WORLD_BASE64
7995
creator = factory.SubFactory(UserFactory)
8096
deleted_at = None
8197
link_reach = factory.fuzzy.FuzzyChoice(

src/backend/core/tests/test_utils.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Test utils base64_yjs_to_text and base64_yjs_to_xml."""
2+
3+
from core import utils
4+
5+
# This base64 string is an example of what is saved in the database.
6+
# It was generated by the blocknote editor and contains
7+
# the text \n# *Hello* \n- w**or**ld
8+
TEST_BASE64_STRING = (
9+
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
10+
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
11+
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
12+
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
13+
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
14+
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
15+
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
16+
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
17+
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
18+
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
19+
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
20+
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
21+
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
22+
)
23+
24+
25+
def test_utils_base64_yjs_to_text():
26+
"""Test extract text from saved yjs document"""
27+
assert utils.base64_yjs_to_text(TEST_BASE64_STRING) == "Hello world"
28+
29+
30+
def test_utils_base64_yjs_to_xml():
31+
"""Test extract xml from saved yjs document"""
32+
content = utils.base64_yjs_to_xml(TEST_BASE64_STRING)
33+
assert (
34+
'<heading "level"="1" "textAlignment"="left">Hello</heading>' in content
35+
or '<heading "textAlignment"="left" "level"="1">Hello</heading>' in content
36+
)
37+
assert '<bulletListItem "textAlignment"="left">world</bulletListItem>' in content
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Test util base64_yjs_to_text."""
2+
3+
from core.utils import base64_yjs_to_text
4+
5+
6+
def test_base64_yjs_to_text():
7+
"""
8+
Test base64_yjs_to_text: extract text from a saved yjs document.
9+
This base64 string is an example of what is saved in the database.
10+
This base64 is generated from the blocknote editor, it contains
11+
the text \n# *Hello* \n- w**or**ld
12+
"""
13+
base64_string = (
14+
"AR717vLVDgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcA9e7y1Q4AAw5ibG9ja0NvbnRh"
15+
"aW5lcgcA9e7y1Q4BAwdoZWFkaW5nBwD17vLVDgIGBgD17vLVDgMGaXRhbGljAnt9hPXu8tUOBAVI"
16+
"ZWxsb4b17vLVDgkGaXRhbGljBG51bGwoAPXu8tUOAg10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y"
17+
"1Q4CBWxldmVsAX0BKAD17vLVDgECaWQBdyQwNGQ2MjM0MS04MzI2LTQyMzYtYTA4My00ODdlMjZm"
18+
"YWQyMzAoAPXu8tUOAQl0ZXh0Q29sb3IBdwdkZWZhdWx0KAD17vLVDgEPYmFja2dyb3VuZENvbG9y"
19+
"AXcHZGVmYXVsdIf17vLVDgEDDmJsb2NrQ29udGFpbmVyBwD17vLVDhADDmJ1bGxldExpc3RJdGVt"
20+
"BwD17vLVDhEGBAD17vLVDhIBd4b17vLVDhMEYm9sZAJ7fYT17vLVDhQCb3KG9e7y1Q4WBGJvbGQE"
21+
"bnVsbIT17vLVDhcCbGQoAPXu8tUOEQ10ZXh0QWxpZ25tZW50AXcEbGVmdCgA9e7y1Q4QAmlkAXck"
22+
"ZDM1MWUwNjgtM2U1NS00MjI2LThlYTUtYWJiMjYzMTk4ZTJhKAD17vLVDhAJdGV4dENvbG9yAXcH"
23+
"ZGVmYXVsdCgA9e7y1Q4QD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSH9e7y1Q4QAw5ibG9ja0Nv"
24+
"bnRhaW5lcgcA9e7y1Q4eAwlwYXJhZ3JhcGgoAPXu8tUOHw10ZXh0QWxpZ25tZW50AXcEbGVmdCgA"
25+
"9e7y1Q4eAmlkAXckODk3MDBjMDctZTBlMS00ZmUwLWFjYTItODQ5MzIwOWE3ZTQyKAD17vLVDh4J"
26+
"dGV4dENvbG9yAXcHZGVmYXVsdCgA9e7y1Q4eD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHQA"
27+
)
28+
29+
assert base64_yjs_to_text(base64_string) == "Hello world"

src/backend/core/utils.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Utils for the core app."""
2+
3+
import base64
4+
5+
import y_py as Y
6+
from bs4 import BeautifulSoup
7+
8+
9+
def base64_yjs_to_xml(base64_string):
10+
"""Extract xml from base64 yjs document."""
11+
12+
decoded_bytes = base64.b64decode(base64_string)
13+
uint8_array = bytearray(decoded_bytes)
14+
15+
doc = Y.YDoc() # pylint: disable=E1101
16+
Y.apply_update(doc, uint8_array) # pylint: disable=E1101
17+
return str(doc.get_xml_element("document-store"))
18+
19+
20+
def base64_yjs_to_text(base64_string):
21+
"""Extract text from base64 yjs document."""
22+
23+
blocknote_structure = base64_yjs_to_xml(base64_string)
24+
soup = BeautifulSoup(blocknote_structure, "html.parser")
25+
return soup.get_text(separator=" ").strip()

src/backend/pyproject.toml

+7
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
2525
readme = "README.md"
2626
requires-python = ">=3.12"
2727
dependencies = [
28+
"beautifulsoup4==4.12.3",
2829
"boto3==1.37.18",
2930
"Brotli==1.1.0",
3031
"celery[redis]==5.4.0",
@@ -47,6 +48,7 @@ dependencies = [
4748
"gunicorn==23.0.0",
4849
"jsonschema==4.23.0",
4950
"markdown==3.7",
51+
"mozilla-django-oidc==4.0.1",
5052
"nested-multipart-parser==1.5.0",
5153
"openai==1.68.2",
5254
"psycopg[binary]==3.2.6",
@@ -55,8 +57,13 @@ dependencies = [
5557
"requests==2.32.3",
5658
"sentry-sdk==2.24.0",
5759
"url-normalize==1.4.3",
60+
<<<<<<< HEAD
5861
"whitenoise==6.9.0",
5962
"mozilla-django-oidc==4.0.1",
63+
=======
64+
"whitenoise==6.8.2",
65+
"y-py==0.6.2",
66+
>>>>>>> f087cd70 (✨(backend) add util to extract text from Ydoc content)
6067
]
6168

6269
[project.urls]

0 commit comments

Comments
 (0)