Skip to content

Commit 12a5a3a

Browse files
committed
add source and scripts
1 parent 4793799 commit 12a5a3a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+8335
-11
lines changed

.gitmodules

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[submodule "thirdparty/Postgres"]
2+
path = thirdparty/Postgres
3+
url = https://github.com/postgres/postgres.git
4+
[submodule "thirdparty/hnsw"]
5+
path = thirdparty/hnsw
6+
url = https://github.com/nmslib/hnswlib.git
7+
[submodule "thirdparty/SPTAG"]
8+
path = thirdparty/SPTAG
9+
url = https://github.com/microsoft/SPTAG.git

CMakeLists.txt

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
cmake_minimum_required(VERSION 3.14.0)
2+
project(vectordb VERSION 0.1.0)
3+
4+
# Find Dependency
5+
6+
set(PostgreSQL_ADDITIONAL_VERSIONS "13")
7+
find_package(PostgreSQL REQUIRED COMPONENTS Server)
8+
9+
# Add Control and SQL Files
10+
11+
set(EXT_CONTROL_FILE ${PROJECT_NAME}.control)
12+
set(EXT_SQL_FILES
13+
sql/${PROJECT_NAME}.sql
14+
# for each version upgrade, add like
15+
# sql/${PROJECT_NAME}--0.1.0--0.1.1.sql
16+
)
17+
18+
list (GET EXT_SQL_FILES 0 EXT_SQL_MAIN)
19+
configure_file(${EXT_CONTROL_FILE}.in ${EXT_CONTROL_FILE})
20+
21+
# Add Dependency
22+
23+
#Include(FetchContent)
24+
25+
#set(LIBRARYONLY ON)
26+
#FetchContent_Declare(
27+
# sptag
28+
# GIT_REPOSITORY https://github.com/microsoft/SPTAG.git
29+
# GIT_TAG master
30+
#)
31+
#FetchContent_GetProperties(sptag)
32+
#if(NOT sptag_POPULATED)
33+
# FetchContent_Populate(sptag)
34+
# add_subdirectory(${sptag_SOURCE_DIR} ${sptag_BINARY_DIR} EXCLUDE_FROM_ALL)
35+
#endif()
36+
37+
#find_package(OpenMP REQUIRED)
38+
39+
if (WIN32)
40+
add_definitions(-D_WIN32_WINNT=0x601)
41+
add_definitions(-DBOOST_THREAD_PROVIDES_NESTED_LOCKS)
42+
else(UNIX)
43+
add_definitions(-DPLATFORM_UNIX)
44+
45+
# Append the SSE4.2 / AES / AVX2 instruction-set switches and -fPIC to the C++ flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -maes -mavx2 -fPIC")
46+
# NOTE(review): the C flags are seeded from CMAKE_CXX_FLAGS (not CMAKE_C_FLAGS) and also embed CMAKE_EXE_LINKER_FLAGS; any pre-existing CMAKE_C_FLAGS value is discarded - confirm this is intentional.
set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} -mmwaitx ${CMAKE_EXE_LINKER_FLAGS} -fPIC")
47+
48+
message (STATUS "UNIX: CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
49+
message (STATUS "UNIX: CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
50+
51+
#Include(FetchContent)
52+
53+
#set(LIBRARYONLY ON)
54+
#FetchContent_Declare(
55+
# sptag
56+
# # GIT_REPOSITORY https://github.com/microsoft/SPTAG.git
57+
# # GIT_TAG master
58+
59+
60+
# GIT_REPOSITORY https://qiazh.visualstudio.com/SPTAG/_git/SPTAG
61+
# GIT_TAG qiazh/debug
62+
#)
63+
#FetchContent_GetProperties(sptag)
64+
#if(NOT sptag_POPULATED)
65+
# FetchContent_Populate(sptag)
66+
# add_subdirectory(${sptag_SOURCE_DIR} ${sptag_BINARY_DIR} EXCLUDE_FROM_ALL)
67+
endif()
68+
69+
# Add Extension Library
70+
71+
add_library(${PROJECT_NAME} SHARED
72+
src/lib.cpp
73+
src/index.cpp
74+
src/index_builder.cpp
75+
src/index_scan.cpp
76+
src/hnswindex.cpp
77+
src/hnswindex_builder.cpp
78+
src/hnswindex_scan.cpp
79+
src/pase_hnswindex.cpp
80+
src/pase_hnswindex_builder.cpp
81+
src/pase_hnswindex_scan.cpp
82+
src/operator.cpp
83+
src/util.cpp
84+
src/model_mng.cpp
85+
src/topk.cpp
86+
src/multicol_topk.cpp
87+
)
88+
89+
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_14)
90+
target_compile_options(${PROJECT_NAME} PRIVATE -Wall -mavx2)
91+
set_target_properties(${PROJECT_NAME} PROPERTIES CXX_EXTENSIONS OFF)
92+
set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "")
93+
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
94+
95+
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
96+
97+
include(thirdparty/CMakeLists.txt)
98+
99+
thirdparty_dependent(${PROJECT_NAME})
100+
101+
# Windows-only setup: add PostgreSQL's win32 port headers and link against
# postgres.lib; MSVC additionally gets the win32_msvc port headers.
if(WIN32)
102+
target_include_directories(${PROJECT_NAME} PUBLIC ${PostgreSQL_TYPE_INCLUDE_DIR}/port/win32)
103+
target_link_libraries(${PROJECT_NAME} PUBLIC ${PostgreSQL_LIBRARY_DIRS}/postgres.lib)
104+
if(MSVC)
105+
target_include_directories(${PROJECT_NAME} PUBLIC ${PostgreSQL_TYPE_INCLUDE_DIR}/port/win32_msvc)
106+
# /TC is deliberately not passed: it forces MSVC to compile every source as C, but this target is built from C++ (.cpp, cxx_std_14) sources.
107+
endif()
108+
endif()
109+
110+
# Installation Configuration
111+
112+
## Find PostgreSQL Paths
113+
114+
foreach(suffix ${PostgreSQL_KNOWN_VERSIONS})
115+
if(WIN32)
116+
list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
117+
"PostgreSQL/${suffix}/bin")
118+
endif()
119+
if(UNIX)
120+
list(APPEND PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES
121+
"postgresql${suffix}"
122+
"postgresql/${suffix}"
123+
"pgsql-${suffix}/bin")
124+
endif()
125+
endforeach()
126+
# Locate the pg_config executable. In addition to the default search
# locations, look under PostgreSQL_ROOT_DIRECTORIES, trying "bin" and the
# per-version suffixes collected in PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES.
# The result is cached in PostgreSQL_PG_CONFIG.
find_program(PostgreSQL_PG_CONFIG
127+
NAMES pg_config
128+
PATHS
129+
${PostgreSQL_ROOT_DIRECTORIES}
130+
PATH_SUFFIXES
131+
bin
132+
${PostgreSQL_BINARY_ADDITIONAL_SEARCH_SUFFIXES}
133+
# Help the user find it if we cannot.
134+
DOC "Path to the pg_config executable; set the PostgreSQL_PG_CONFIG cache entry to its full path if it cannot be found automatically."
135+
)
136+
if (NOT PostgreSQL_PG_CONFIG)
137+
message(FATAL_ERROR "Unable to find 'pg_config'")
138+
endif ()
139+
message(STATUS "Using pg_config ${PostgreSQL_PG_CONFIG}")
140+
141+
# Ask pg_config for the shared-data directory (--sharedir) and the loadable
# module directory (--pkglibdir). Both values are used as install destinations
# below, so configuration stops if either query fails or returns nothing.
execute_process(
142+
COMMAND ${PostgreSQL_PG_CONFIG} --sharedir
143+
OUTPUT_VARIABLE PostgreSQL_SHAREDIR
144+
RESULT_VARIABLE PostgreSQL_SHAREDIR_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PostgreSQL_SHAREDIR_RESULT EQUAL 0 OR "${PostgreSQL_SHAREDIR}" STREQUAL "")
message(FATAL_ERROR "'${PostgreSQL_PG_CONFIG} --sharedir' failed (result: ${PostgreSQL_SHAREDIR_RESULT})")
endif()
145+
execute_process(
146+
COMMAND ${PostgreSQL_PG_CONFIG} --pkglibdir
147+
OUTPUT_VARIABLE PostgreSQL_PKGLIBDIR
148+
RESULT_VARIABLE PostgreSQL_PKGLIBDIR_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PostgreSQL_PKGLIBDIR_RESULT EQUAL 0 OR "${PostgreSQL_PKGLIBDIR}" STREQUAL "")
message(FATAL_ERROR "'${PostgreSQL_PG_CONFIG} --pkglibdir' failed (result: ${PostgreSQL_PKGLIBDIR_RESULT})")
endif()
149+
150+
## Install Files
151+
152+
install(
153+
FILES ${CMAKE_CURRENT_BINARY_DIR}/${EXT_CONTROL_FILE}
154+
DESTINATION "${PostgreSQL_SHAREDIR}/extension")
155+
foreach(EXT_SQL_FILE ${EXT_SQL_FILES})
156+
install(
157+
FILES ${CMAKE_CURRENT_SOURCE_DIR}/${EXT_SQL_FILE}
158+
DESTINATION "${PostgreSQL_SHAREDIR}/extension")
159+
endforeach(EXT_SQL_FILE)
160+
install(
161+
FILES ${CMAKE_CURRENT_SOURCE_DIR}/${EXT_SQL_MAIN}
162+
DESTINATION "${PostgreSQL_SHAREDIR}/extension"
163+
RENAME ${PROJECT_NAME}--${PROJECT_VERSION}.sql)
164+
install(
165+
TARGETS ${PROJECT_NAME}
166+
DESTINATION ${PostgreSQL_PKGLIBDIR})

Dockerfile

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
FROM debian:bullseye-slim
2+
3+
# make the "en_US.UTF-8" locale so postgres will be utf-8 enabled by default
4+
ENV LANG en_US.utf8
5+
ENV PG_MAJOR 13
6+
ENV PG_VERSION 13.4
7+
ENV PGDATA /u02/pgdata
8+
# Declare the three connection settings as separate, initially empty variables.
# The key=value form is required: the legacy "ENV key value" form assigns the
# whole remainder of the instruction to the first key only.
ENV PGDATABASE="" \
9+
PGUSERNAME="" \
10+
PGPASSWORD=""
11+
12+
ARG UID=999
13+
ARG GID=999
14+
15+
ENV PG_INSTALL_DIR /u01/app/postgres/product/${PG_VERSION}
16+
17+
RUN set -ex \
18+
\
19+
&& apt-get update && apt-get install -y \
20+
gettext \
21+
ca-certificates \
22+
build-essential \
23+
curl \
24+
procps \
25+
sysstat \
26+
libldap2-dev \
27+
python-dev \
28+
libreadline-dev \
29+
libssl-dev \
30+
bison \
31+
flex \
32+
libghc-zlib-dev \
33+
libcrypto++-dev \
34+
libxml2-dev \
35+
libxslt1-dev \
36+
bzip2 \
37+
make \
38+
gcc \
39+
unzip \
40+
python \
41+
locales \
42+
wget \
43+
\
44+
&& rm -rf /var/lib/apt/lists/* \
45+
&& localedef -i en_US -c -f UTF-8 en_US.UTF-8
46+
47+
RUN apt-get update && \
48+
apt-get install -y --no-install-recommends wget git golang-go python-dev swig vim\
49+
libboost-filesystem-dev libboost-test-dev libboost-serialization-dev libboost-regex-dev libboost-serialization-dev libboost-regex-dev libboost-thread-dev libboost-system-dev
50+
51+
RUN wget "https://boostorg.jfrog.io/artifactory/main/release/1.71.0/source/boost_1_71_0.tar.gz" --no-check-certificate -q -O - \
52+
| tar -xz && \
53+
cd boost_1_71_0 && \
54+
./bootstrap.sh && \
55+
./b2 install && \
56+
ldconfig && \
57+
cd .. && rm -rf boost_1_71_0
58+
59+
RUN wget "https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.tar.gz" --no-check-certificate -q -O - \
60+
| tar -xz --strip-components=1 -C /usr/local
61+
62+
RUN apt-get install -y software-properties-common && add-apt-repository 'deb http://archive.debian.org/debian stretch stretch-security main contrib non-free' && apt-get update && apt-get install -y openjdk-8-jdk
63+
64+
RUN apt install -y pip && pip install numpy && pip install pandas
65+
66+
RUN git config --global http.sslverify false
67+
68+
RUN wget http://ftp.gnu.org/gnu/gcc/gcc-12.2.0/gcc-12.2.0.tar.gz \
69+
&& tar -zxvf gcc-12.2.0.tar.gz \
70+
&& cd gcc-12.2.0 \
71+
&& ./contrib/download_prerequisites \
72+
&& mkdir build \
73+
&& cd build/ \
74+
&& ../configure -enable-checking=release -enable-languages=c,c++ -disable-multilib \
75+
&& make -j$(nproc) \
76+
&& make install \
77+
&& rm /usr/bin/gcc \
78+
&& ln -s /usr/local/bin/gcc /usr/bin/gcc \
79+
&& rm /usr/bin/g++ \
80+
&& ln -s /usr/local/bin/g++ /usr/bin/g++ \
81+
&& rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6 \
82+
&& ln -s /usr/local/lib64/libstdc++.so.6.0.30 /usr/lib/x86_64-linux-gnu/libstdc++.so.6
83+
84+
RUN mkdir /u01/ \
85+
\
86+
&& groupadd -r postgres --gid=$GID \
87+
&& useradd -m -r -g postgres --uid=$UID postgres \
88+
&& chown postgres:postgres /u01/ \
89+
&& mkdir -p "$PGDATA" \
90+
&& chown -R postgres:postgres "$PGDATA" \
91+
&& chmod 700 "$PGDATA"
92+
93+
COPY ./thirdparty/Postgres /home/postgres/src/
94+
RUN echo "this line could be executed for a very long time" && chown -R postgres:postgres /home/postgres/src
95+
96+
RUN cd /home/postgres/src \
97+
&& su postgres -c "./configure \
98+
--with-blocksize=32 \
99+
--enable-integer-datetimes \
100+
--enable-thread-safety \
101+
--with-pgport=5432 \
102+
--prefix=$PG_INSTALL_DIR \
103+
--with-ldap \
104+
--with-python \
105+
--with-openssl \
106+
--with-libxml \
107+
--with-libxslt \
108+
--enable-nls=yes" \
109+
# --enable-debug \
110+
# --enable-cassert \
111+
# CFLAGS='-ggdb -O0'" \
112+
&& su postgres -c "make -j$(nproc) all" \
113+
&& su postgres -c "make install" \
114+
&& su postgres -c "make -C contrib install" \
115+
&& apt-get install -y libxml2
116+
117+
ENV PGDATA ${PGDATA}/${PG_MAJOR}
118+
COPY ./scripts/pg_scripts/*.sh /usr/local/bin/
119+
120+
ENV PATH="${PATH}:${PG_INSTALL_DIR}/bin"
121+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${PG_INSTALL_DIR}/lib"
122+
123+
COPY . /tmp/vectordb
124+
125+
RUN chown -R postgres:postgres /tmp/vectordb
126+
127+
ENV PostgreSQL_ROOT ${PG_INSTALL_DIR}
128+
129+
130+
RUN cd /tmp/vectordb && \
131+
mkdir build && \
132+
cd build && \
133+
cmake -DCMAKE_INSTALL_PREFIX=/usr/local/vectordb -DLIBRARYONLY=ON -DSEEK_ENABLE_TESTS=ON -DCMAKE_BUILD_TYPE=Release .. && \
134+
make -j$(nproc) && \
135+
make install
136+
137+
# The following two commands install an HTTP client library called curlpp.
138+
RUN apt-get update && apt-get install -y libcurl4-openssl-dev pkg-config
139+
RUN git clone https://github.com/jpbarrette/curlpp.git && cd curlpp && \
140+
git reset --hard 592552a && \
141+
mkdir build && cd build && \
142+
cmake .. && make -j$(nproc) && \
143+
make install
144+
145+
ENV PATH="${PATH}:/usr/local/lib"
146+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
147+
148+
ENV LANG en_US.utf8
149+
USER postgres
150+
EXPOSE 5432
151+
ENTRYPOINT ["docker-entrypoint.sh"]
152+

README.md

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,44 @@
1-
# Project
2-
3-
> This repo has been populated by an initial template to help get you started. Please
4-
> make sure to update the content to build a great experience for community-building.
5-
6-
As the maintainer of this project, please make a few updates:
7-
8-
- Improving this README.MD file to provide a great experience
9-
- Updating SUPPORT.MD with content about this project's support experience
10-
- Understanding the security reporting process in SECURITY.MD
11-
- Remove this section from the README
1+
# MSVBASE
2+
3+
MSVBASE is a system that efficiently supports complex queries of both approximate similarity search and relational operators. It integrates high-dimensional vector indices into PostgreSQL, a relational database, to facilitate complex approximate similarity queries.
4+
5+
## **Build Docker**
6+
### **Clone and Patch**
7+
```
8+
git clone https://github.com/microsoft/MSVBASE.git
9+
cd MSVBASE
git submodule update --init --recursive
10+
./scripts/patch.sh
11+
```
12+
### **Build**
13+
```
14+
./scripts/dockerbuild.sh
15+
```
16+
17+
### **Run**
18+
```
19+
./scripts/dockerrun.sh
20+
```
21+
22+
## **SQL**
23+
It is compatible with PostgreSQL syntax and protocols, supporting vector distance calculations for L2 and Inner product. It also supports hnsw and sptag indexes. Soon, we will be introducing spann and more indexes. Stay tuned!
24+
### **Command Line**
25+
```
26+
docker exec -it --privileged --user=root vbase_open_source bash
27+
psql -U vectordb
28+
```
29+
### **Example**
30+
```
31+
create database test;
32+
\c test;
33+
create extension vectordb;
34+
create table t_table(id int, price int, m_vector_1 float8[10]);
35+
insert into t_table values(1, 10, '{1,2,3,4,5,6,7,8,9,0}');
36+
insert into t_table values(2, 20, '{5,6,7,1,2,3,4,8,9,1}');
37+
insert into t_table values(3, 30, '{9,8,7,6,5,4,3,2,1,0}');
38+
create index t4_index on t_table using hnsw(m_vector_1) with(dimension=10,distmethod=l2_distance);
39+
set enable_seqscan=false;
40+
select id from t_table where price > 15 order by m_vector_1 <-> '{5,9,8,6,2,1,1,0,4,3}' limit 1;
41+
```
1242

1343
## Contributing
1444

0 commit comments

Comments
 (0)