Skip to content

Commit 5aafd58

Browse files
zqxjjjQianxi Zhang
authored and committed
Set up CI with Azure Pipelines and Update Readme
1 parent 12a5a3a commit 5aafd58

File tree

139 files changed

+11107246
-368
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+11107246
-368
lines changed

.github/workflows/codeql.yml

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# For most projects, this workflow file will not need changing; you simply need
2+
# to commit it to your repository.
3+
#
4+
# You may wish to alter this file to override the set of languages analyzed,
5+
# or to provide custom queries or build logic.
6+
#
7+
# ******** NOTE ********
8+
# We have attempted to detect the languages in your repository. Please check
9+
# the `language` matrix defined below to confirm you have the correct set of
10+
# supported CodeQL languages.
11+
#
12+
name: "CodeQL"
13+
14+
on:
15+
push:
16+
branches: [ "main" ]
17+
pull_request:
18+
# The branches below must be a subset of the branches above
19+
branches: [ "main" ]
20+
schedule:
21+
- cron: '16 4 * * 3'
22+
23+
jobs:
24+
analyze:
25+
name: Analyze
26+
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
27+
timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
28+
permissions:
29+
actions: read
30+
contents: read
31+
security-events: write
32+
33+
strategy:
34+
fail-fast: false
35+
matrix:
36+
language: [ 'cpp' ]
37+
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ]
38+
# Use only 'java' to analyze code written in Java, Kotlin or both
39+
# Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
40+
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
41+
42+
steps:
43+
- name: Checkout repository
44+
uses: actions/checkout@v3
45+
46+
# Initializes the CodeQL tools for scanning.
47+
- name: Initialize CodeQL
48+
uses: github/codeql-action/init@v2
49+
with:
50+
languages: ${{ matrix.language }}
51+
# If you wish to specify custom queries, you can do so here or in a config file.
52+
# By default, queries listed here will override any specified in a config file.
53+
# Prefix the list here with "+" to use these queries and those in the config file.
54+
55+
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
56+
# queries: security-extended,security-and-quality
57+
58+
59+
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
60+
# If this step fails, then you should remove it and run the build manually (see below)
61+
#- name: Autobuild
62+
# uses: github/codeql-action/autobuild@v2
63+
64+
# ℹ️ Command-line programs to run using the OS shell.
65+
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
66+
67+
# If the Autobuild fails above, remove it and uncomment the following three lines.
68+
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
69+
# Manual build step (Autobuild is commented out above): initialise the git
# submodules, then run scripts/patch.sh. `name` and `run` belong to ONE step.
- name: Build
70+
run: |
71+
echo "Run, Build Application using script"
72+
git submodule update --init --recursive
73+
./scripts/patch.sh
74+
75+
- name: Perform CodeQL Analysis
76+
uses: github/codeql-action/analyze@v2
77+
with:
78+
category: "/language:${{matrix.language}}"

.github/workflows/docker-image.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: Docker Image CI
2+
3+
on:
4+
push:
5+
branches: [ "main" ]
6+
pull_request:
7+
branches: [ "main" ]
8+
9+
jobs:
10+
11+
build:
12+
13+
runs-on: ubuntu-latest
14+
15+
steps:
16+
- uses: actions/checkout@v3
17+
# Fetch submodules, run scripts/patch.sh, then build the image from ./Dockerfile.
# The tag uses a lowercase repository name because Docker rejects uppercase
# repository names; the Unix timestamp makes each build's tag unique.
- name: Build the Docker image
18+
run: |
19+
git submodule update --init --recursive
20+
./scripts/patch.sh
21+
docker build . --file Dockerfile --tag msvbase:$(date +%s)

CMakeLists.txt

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Copyright (c) Microsoft Corporation. All rights reserved.
2+
# Licensed under the MIT License.
3+
14
cmake_minimum_required(VERSION 3.14.0)
25
project(vectordb VERSION 0.1.0)
36

@@ -84,6 +87,8 @@ add_library(${PROJECT_NAME} SHARED
8487
src/model_mng.cpp
8588
src/topk.cpp
8689
src/multicol_topk.cpp
90+
src/spannindex.cpp
91+
src/spannindex_scan.cpp
8792
)
8893

8994
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_14)
@@ -111,28 +116,29 @@ endif(WIN32)
111116

112117
## Find PostgreSQL Paths
113118

114-
foreach(suffix ${PostgreSQL_KNOWN_VERSIONS})
115-
if(WIN32)
116-
list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
117-
"PostgreSQL/${suffix}/bin")
118-
endif()
119-
if(UNIX)
120-
list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
121-
"postgresql${suffix}"
122-
"postgresql/${suffix}"
123-
"pgsql-${suffix}/bin")
124-
endif()
125-
endforeach()
126-
find_program(PostgreSQL_PG_CONFIG pg_config
127-
NAMES ${ARGN}
128-
PATHS
129-
${PostgreSQL_ROOT_DIRECTORIES}
130-
PATH_SUFFIXES
131-
bin
132-
${PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES}
119+
#foreach(suffix ${PostgreSQL_KNOWN_VERSIONS})
120+
# if(WIN32)
121+
# list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
122+
# "PostgreSQL/${suffix}/bin")
123+
# endif()
124+
# if(UNIX)
125+
# list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
126+
# "postgresql${suffix}"
127+
# "postgresql/${suffix}"
128+
# "pgsql-${suffix}/bin")
129+
# endif()
130+
#endforeach()
131+
#find_program(PostgreSQL_PG_CONFIG pg_config
132+
#NAMES ${ARGN}
133+
#PATHS
134+
# ${PostgreSQL_ROOT_DIRECTORIES}
135+
#PATH_SUFFIXES
136+
# bin
137+
# ${PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES}
133138
# Help the user find it if we cannot.
134-
DOC "Set the PostgreSQL_BINARY_DIR cmake cache entry to the top-level directory containing the PostgreSQL binaries."
135-
)
139+
#DOC "Set the PostgreSQL_BINARY_DIR cmake cache entry to the top-level directory containing the PostgreSQL binaries."
140+
#)
141+
# Locate pg_config. The directory that used to be hard-coded here is tried
# first (presumably the Docker image's PostgreSQL install location - confirm),
# then CMake's normal program search path. Override on the command line with
# -DPostgreSQL_PG_CONFIG=/path/to/pg_config.
find_program(PostgreSQL_PG_CONFIG pg_config
HINTS /u01/app/postgres/product/13.4/bin
DOC "Path to the PostgreSQL pg_config executable.")
136142
if (NOT PostgreSQL_PG_CONFIG)
137143
message(FATAL_ERROR "Unable to find 'pg_config'")
138144
endif ()

Dockerfile

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
FROM debian:bullseye-slim
2-
1+
#FROM debian:bullseye-slim
2+
FROM gcc:12.3.0
33
# make the "en_US.UTF-8" locale so postgres will be utf-8 enabled by default
44
ENV LANG en_US.utf8
55
ENV PG_MAJOR 13
@@ -24,7 +24,7 @@ RUN set -ex \
2424
procps \
2525
sysstat \
2626
libldap2-dev \
27-
python-dev \
27+
python3-dev \
2828
libreadline-dev \
2929
libssl-dev \
3030
bison \
@@ -37,49 +37,49 @@ RUN set -ex \
3737
make \
3838
gcc \
3939
unzip \
40-
python \
40+
python3 \
4141
locales \
4242
wget \
4343
\
44-
&& rm -rf /var/lib/apt/lists/* \
44+
# && rm -rf /var/lib/apt/lists/* \
4545
&& localedef -i en_US -c -f UTF-8 en_US.UTF-8
4646

4747
RUN apt-get update && \
48-
apt-get install -y --no-install-recommends wget git golang-go python-dev swig vim\
48+
apt-get install -y --no-install-recommends wget git golang-go swig vim\
4949
libboost-filesystem-dev libboost-test-dev libboost-serialization-dev libboost-regex-dev libboost-serialization-dev libboost-regex-dev libboost-thread-dev libboost-system-dev
5050

51-
RUN wget "https://boostorg.jfrog.io/artifactory/main/release/1.71.0/source/boost_1_71_0.tar.gz" --no-check-certificate -q -O - \
51+
RUN wget "https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0.tar.gz" --no-check-certificate -q -O - \
5252
| tar -xz && \
53-
cd boost_1_71_0 && \
53+
cd boost_1_81_0 && \
5454
./bootstrap.sh && \
5555
./b2 install && \
5656
ldconfig && \
57-
cd .. && rm -rf boost_1_71_0
57+
cd .. && rm -rf boost_1_81_0
5858

5959
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.tar.gz" --no-check-certificate -q -O - \
6060
| tar -xz --strip-components=1 -C /usr/local
6161

62-
RUN apt-get install -y software-properties-common && add-apt-repository 'deb http://archive.debian.org/debian stretch stretch-security main contrib non-free' && apt-get update && apt-get install -y openjdk-8-jdk
62+
#RUN apt-get install -y software-properties-common && add-apt-repository 'deb http://archive.debian.org/debian stretch stretch-security main contrib non-free' && apt-get update && apt-get install -y openjdk-8-jdk
6363

64-
RUN apt install -y pip && pip install numpy && pip install pandas
64+
#RUN apt install -y pip && pip install numpy && pip install pandas
6565

6666
RUN git config --global http.sslverify false
6767

68-
RUN wget http://ftp.gnu.org/gnu/gcc/gcc-12.2.0/gcc-12.2.0.tar.gz \
69-
&& tar -zxvf gcc-12.2.0.tar.gz \
70-
&& cd gcc-12.2.0 \
71-
&& ./contrib/download_prerequisites \
72-
&& mkdir build \
73-
&& cd build/ \
74-
&& ../configure -enable-checking=release -enable-languages=c,c++ -disable-multilib \
75-
&& make -j$(nproc) \
76-
&& make install \
77-
&& rm /usr/bin/gcc \
78-
&& ln -s /usr/local/bin/gcc /usr/bin/gcc \
79-
&& rm /usr/bin/g++ \
80-
&& ln -s /usr/local/bin/g++ /usr/bin/g++ \
81-
&& rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6 \
82-
&& ln -s /usr/local/lib64/libstdc++.so.6.0.30 /usr/lib/x86_64-linux-gnu/libstdc++.so.6
68+
#RUN wget http://ftp.gnu.org/gnu/gcc/gcc-12.2.0/gcc-12.2.0.tar.gz \
69+
# && tar -zxvf gcc-12.2.0.tar.gz \
70+
# && cd gcc-12.2.0 \
71+
# && ./contrib/download_prerequisites \
72+
# && mkdir build \
73+
# && cd build/ \
74+
# && ../configure -enable-checking=release -enable-languages=c,c++ -disable-multilib \
75+
# && make -j4 \
76+
# && make install \
77+
# && rm /usr/bin/gcc \
78+
# && ln -s /usr/local/bin/gcc /usr/bin/gcc \
79+
# && rm /usr/bin/g++ \
80+
# && ln -s /usr/local/bin/g++ /usr/bin/g++ \
81+
# && rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6 \
82+
# && ln -s /usr/local/lib64/libstdc++.so.6.0.30 /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8383

8484
RUN mkdir /u01/ \
8585
\
@@ -135,12 +135,12 @@ RUN cd /tmp/vectordb && \
135135
make install
136136

137137
# the followings two commands install an http-client library called curlpp
138-
RUN apt-get update && apt-get install -y libcurl4-openssl-dev pkg-config
139-
RUN git clone https://github.com/jpbarrette/curlpp.git && cd curlpp && \
140-
git reset --hard 592552a && \
141-
mkdir build && cd build && \
142-
cmake .. && make -j$(nproc) && \
143-
make install
138+
#RUN apt-get update && apt-get install -y libcurl4-openssl-dev pkg-config
139+
#RUN git clone https://github.com/jpbarrette/curlpp.git && cd curlpp && \
140+
# git reset --hard 592552a && \
141+
# mkdir build && cd build && \
142+
# cmake .. && make -j$(nproc) && \
143+
# make install
144144

145145
ENV PATH="${PATH}:/usr/local/lib"
146146
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
@@ -149,4 +149,5 @@ ENV LANG en_US.utf8
149149
USER postgres
150150
EXPOSE 5432
151151
ENTRYPOINT ["docker-entrypoint.sh"]
152+
#ENTRYPOINT ["sleep","infinity"]
152153

README.md

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
# MSVBASE
2+
## What's NEW!
3+
New Research Paper [VBASE: Unifying Online Vector Similarity Search and Relational Queries via Relaxed Monotonicity](https://www.usenix.org/system/files/osdi23-zhang-qianxi_1.pdf) - _published in OSDI 2023_
4+
## What is MSVBASE?
5+
MSVBASE is a new system capable of efficiently serving complex online queries that involve both approximate similarity search and relational operators on scalar and vector datasets. MSVBASE builds a unified query execution engine to support a wide range of queries on both scalar and vector data, and shows superior query performance and accuracy.
6+
It integrates high-dimensional vector indices into PostgreSQL with minimal code modifications.
27

3-
MSVBASE is a system that efficiently supports complex queries of both approximate similarity search and relational operators. It integrates high-dimensional vector indices into PostgreSQL, a relational database to facilitate complex approximate similarity queries.
4-
5-
## **Build Docker**
8+
## **Quickstart**
69
### **Clone and Patch**
710
```
811
git clone https://github.com/microsoft/MSVBASE.git
12+
cd MSVBASE
913
git submodule update --init --recursive
1014
./scripts/patch.sh
1115
```
@@ -16,28 +20,75 @@ git submodule update --init --recursive
1620

1721
### **Run**
1822
```
19-
./scripts/dockerbuild.sh
23+
./scripts/dockerrun.sh
2024
```
2125

22-
## **SQL**
23-
It is compatible with PostgreSQL syntax and protocols, supporting vector distance calculations for L2 and Inner product. It also supports hnsw and sptag indexes. Soon, we will be introducing spann and more indexes. Stay tuned!
26+
## **Features and SQL syntax**
27+
It is compatible with PostgreSQL syntax and protocol, supporting vector distance calculations for L2 and Inner product. It also supports [hnsw](https://github.com/nmslib/hnswlib) and [sptag](https://github.com/microsoft/SPTAG/) indices. Soon, we are going to integrate spann and more indices.
2428
### **Command Line**
2529
```
2630
docker exec -it --privileged --user=root vbase_open_source bash
2731
psql -U vectordb
2832
```
29-
### **Example**
33+
34+
### **SQL syntax**
35+
It preserves all the features of PostgreSQL while extending query support on vector data.
36+
* Currently, 'float array' is used to store vectors.
37+
```
38+
create table t_table(id int, price int, vector_1 float8[10], vector_2 float8[10]);
39+
```
40+
* Insert or Import data.
41+
```
42+
insert into t_table values(1, 10, '{1,2,3,4,5,6,7,8,9,0}', '{5,6,7,1,2,3,4,8,9,1}');
43+
copy t_table from 'your_data_path.tsv' DELIMITER E'\t' csv quote e'\x01';
44+
```
45+
* When creating a vector index, it is necessary to specify the algorithm and the distance calculation method to be used.
46+
```
47+
create index vector_index_1 on t_table using hnsw(vector_1) with(dimension=10,distmethod=l2_distance);
48+
create index vector_index_2 on t_table using sptag(vector_2) with(dimension=10,distmethod=inner_product_distance);
49+
```
50+
* **TopK**. When calculating distances, the '<->' operator represents the L2 distance, while '<*>' represents the inner product distance.
51+
```
52+
select id from t_table order by vector_1 <-> '{5,9,8,6,2,1,1,0,4,3}' limit 10;
53+
select id from t_table order by vector_2 <*> '{5,9,8,6,2,1,1,0,4,3}' limit 5;
54+
```
55+
* **TopK + Filter**.
56+
```
57+
select id from t_table where price > 15 order by vector_1 <-> '{5,9,8,6,2,1,1,0,4,3}' limit 10;
58+
select id from t_table where price > 15 order by vector_2 <*> '{5,9,8,6,2,1,1,0,4,3}' limit 5;
59+
```
60+
* **Distance Range Filter**. It also supports distance threshold-based filtering queries. The query will retrieve vector data that is within the distance threshold.
61+
In the query, the '<<->>' operator represents the L2 distance, while '<<*>>' represents the inner product distance.
62+
The first element of the array represents the distance threshold.
63+
```
64+
select id from t_table where price > 15 and vector_1 <<->> '{30,5,9,8,6,2,1,1,0,4,3}';
65+
```
66+
* **Multi-vector Column Query**.
67+
```
68+
select id from t_table
69+
order by approximate_sum('0.5 * vector_1<->{5,9,8,6,2,1,1,0,4,3} + vector_2<*>{5,9,8,6,2,1,1,0,4,3}' ) limit 5;
70+
```
71+
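* **Join**. Join two tables on vector similarity within a distance threshold. As in the distance range filter, the first element of the right-hand array is the threshold (10 in the example below).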
72+
```
73+
select t_table.id as tid, d_table.id as did
74+
from t_table join d_table
75+
on t_table.vector_2 <<*>> array_cat(ARRAY[cast(10 as float8)], d_table.vector_2);
76+
```
77+
78+
* Example
3079
```
3180
create database test;
3281
\c test;
3382
create extension vectordb;
34-
create table t_table(id int, price int, m_vector_1 float8[10]);
83+
create table t_table(id int, price int, vector_1 float8[10]);
3584
insert into t_table values(1, 10, '{1,2,3,4,5,6,7,8,9,0}');
3685
insert into t_table values(2, 20, '{5,6,7,1,2,3,4,8,9,1}');
3786
insert into t_table values(3, 30, '{9,8,7,6,5,4,3,2,1,0}');
38-
create index t4_index on t_table using hnsw(m_vector_1) with(dimension=10,distmethod=l2_distance);
87+
create index t4_index on t_table using hnsw(vector_1) with(dimension=10,distmethod=l2_distance);
3988
set enable_seqscan=false;
40-
select id from t_table where price > 15 order by m_vector_1 <-> '{5,9,8,6,2,1,1,0,4,3}' limit 1;
89+
select id from t_table where price > 15 order by vector_1 <-> '{5,9,8,6,2,1,1,0,4,3}' limit 1;
90+
insert into t_table values(4, 40, '{19,18,17,16,15,14,13,12,11,10}');
91+
delete from t_table where id = 2;
4192
```
4293

4394
## Contributing

0 commit comments

Comments
 (0)