Skip to content

Commit df422b6

Browse files
committed
Can now encode and decode MarkLogic internal vectors
1 parent 2675b8c commit df422b6

File tree

3 files changed

+95
-2
lines changed

3 files changed

+95
-2
lines changed

marklogic/vector_util.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import base64
2+
import struct
3+
from typing import List
4+
5+
6+
class VectorUtil:
    """
    Supports encoding and decoding vectors using the same approach as the vec:base64-encode and vec:base64-decode
    functions supported by the MarkLogic server.

    Before base64 encoding, the binary layout is little-endian: a 32-bit integer version (always 0),
    a 32-bit integer dimension count, and then one 32-bit float per dimension.
    """

    @staticmethod
    def base64_encode(vector: List[float]) -> str:
        """
        Encodes a list of floats as a base64 string compatible with MarkLogic's vec:base64-encode.

        :param vector: the values to encode; each one is stored as a 32-bit float, so precision
            beyond that is lost.
        :return: the base64 text (ASCII) of the version/dimensions header followed by the floats.
        :raises struct.error: if a value cannot be packed as a 32-bit float.
        """
        dimensions = len(vector)
        # version (int32, 0) + dimensions (int32) + floats (little-endian).
        # A repeat count ("3f") is used rather than a repeated character ("fff") so the format
        # string stays short regardless of the vector length.
        buffer = struct.pack(f"<ii{dimensions}f", 0, dimensions, *vector)
        return base64.b64encode(buffer).decode("ascii")

    @staticmethod
    def base64_decode(encoded_vector: str) -> List[float]:
        """
        Decodes a base64 string to a list of floats compatible with MarkLogic's vec:base64-decode.

        :param encoded_vector: base64 text holding the version/dimensions header and the floats.
        :return: the decoded values, one per dimension declared in the header. Bytes after the
            declared floats are ignored.
        :raises ValueError: if the version is not 0 or the declared dimension count is negative.
        :raises struct.error: if the payload is shorter than the header or than the number of
            floats the header declares.
        """
        buffer = base64.b64decode(encoded_vector)
        version, dimensions = struct.unpack_from("<ii", buffer, 0)
        if version != 0:
            raise ValueError(f"Unsupported vector version: {version}")
        if dimensions < 0:
            # The count is read as a signed int32, so a corrupt header can be negative; reject it
            # instead of quietly decoding it as an empty vector.
            raise ValueError(f"Invalid vector dimensions: {dimensions}")
        # The dimension count comes from the payload itself. Using a repeat count keeps the format
        # string tiny even when a corrupt header declares billions of dimensions, and unpack_from
        # reads the floats in place (offset 8 skips the header) without copying a slice.
        return list(struct.unpack_from(f"<{dimensions}f", buffer, 8))

test-app/docker-compose.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
name: marklogic_python
1+
# name: docker-tests-marklogic_python
2+
name: docker-tests-marklogic_python-12
23

34
services:
45

56
marklogic:
6-
image: "progressofficial/marklogic-db:11.3.0-ubi"
7+
# image: "progressofficial/marklogic-db:11.3.0-ubi"
8+
image: "ml-docker-db-dev-tierpoint.bed-artifactory.bedford.progress.com/marklogic/marklogic-server-ubi-rootless:latest-12"
79
platform: linux/amd64
810
environment:
911
- INSTALL_CONVERTERS=true

tests/test_vector_util.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import math
2+
import ast
3+
from marklogic.vector_util import VectorUtil
4+
from marklogic import Client
5+
6+
VECTOR = [3.14, 1.59, 2.65]
7+
EXPECTED_BASE64 = "AAAAAAMAAADD9UhAH4XLP5qZKUA="
8+
ACCEPTABLE_DELTA = 0.0001
9+
10+
11+
def test_encode_and_decode_with_python():
    """
    Encode a vector in Python, decode it in Python, and check the result.
    """
    encoded = VectorUtil.base64_encode(VECTOR)
    assert encoded == EXPECTED_BASE64

    decoded = VectorUtil.base64_decode(encoded)
    assert len(decoded) == len(VECTOR)
    for a, b in zip(decoded, VECTOR):
        # The round trip is not expected to be bit-exact, so compare within a tolerance.
        assert math.isclose(a, b, abs_tol=ACCEPTABLE_DELTA)
19+
20+
21+
def test_decode_known_base64():
    """
    Decode a known base64 string in Python and check it yields the expected vector.
    """
    decoded = VectorUtil.base64_decode(EXPECTED_BASE64)
    assert len(decoded) == len(VECTOR)
    for a, b in zip(decoded, VECTOR):
        # The decoded values are compared within a tolerance rather than for exact equality.
        assert math.isclose(a, b, abs_tol=ACCEPTABLE_DELTA)
26+
27+
28+
def test_encode_and_decode_with_server(client: Client):
    """
    Verify that a vector encoded by VectorUtil is decoded to the same values by the MarkLogic server.
    """
    encoded = VectorUtil.base64_encode(VECTOR)
    assert encoded == EXPECTED_BASE64

    # Ask the server to decode the Python-encoded value via the client's eval call.
    response = client.eval(xquery=f"vec:base64-decode('{encoded}')")
    # The first result item is bytes; its text is parsed as a Python literal list.
    server_text = response[0].decode("utf-8")
    server_floats = ast.literal_eval(server_text)

    assert len(server_floats) == len(VECTOR)
    for actual, expected in zip(server_floats, VECTOR):
        assert math.isclose(actual, expected, abs_tol=ACCEPTABLE_DELTA)
42+
43+
44+
def test_encode_with_server_and_decode_with_python(client: Client):
    """
    Verify that a vector encoded by the MarkLogic server is decoded to the same values by VectorUtil.
    """
    # The server encodes the same three values that VECTOR holds.
    server_encoded = client.eval(
        xquery="vec:base64-encode(vec:vector((3.14, 1.59, 2.65)))"
    )[0]
    assert server_encoded == EXPECTED_BASE64

    python_floats = VectorUtil.base64_decode(server_encoded)
    assert len(python_floats) == len(VECTOR)
    for actual, expected in zip(python_floats, VECTOR):
        assert math.isclose(actual, expected, abs_tol=ACCEPTABLE_DELTA)

0 commit comments

Comments
 (0)