Commit 9083c08

Merge pull request #157 from xataio/add-snapshot-listener
Add snapshot listener
2 parents 299d363 + 160fe33 commit 9083c08

15 files changed, +383 -40 lines changed

README.md

+68-7
@@ -20,20 +20,23 @@
 - Fast initial snapshots
 - Column value transformations
 - Modular deployment configuration, only requires Postgres
-- Schema based message partitioning
-- Schema filtering
-- Elasticsearch/OpenSearch replication output plugin support
-- Webhook support
+- Multiple out of the box supported outputs
+  - Elasticsearch/OpenSearch
+  - Webhooks
+  - PostgreSQL
+- Kafka support with schema based partitioning
+- Snapshot only mode (for when you don't need continuous replication)
+- Extendable support for custom output plugins
 - Automatic discovery of table primary key/unique not null columns for use as event identity
-- Highly customisable modules when used as library
 - Core metrics available via opentelemetry
-- Extendable support for custom replication output plugins
 - Continuous consumption of replication slot with configurable memory guards

 ## Table of Contents

 - [Usage](#usage)
 - [Configuration](#configuration)
+- [Tracking Schema Changes](#tracking-schema-changes)
+- [Snapshots](#snapshots)
 - [Architecture](#architecture)
 - [Limitations](#limitations)
 - [Glossary](#glossary)
@@ -79,6 +82,25 @@ If you have an environment available, with at least Postgres and whichever modul
 docker-compose -f build/docker/docker-compose.yml up
 ```

+The docker-compose file has profiles that can be used to bring up only the relevant containers. If, for example, you only want to run PostgreSQL to PostgreSQL pgstream replication, you can use the `pg2pg` profile as follows:
+
+```
+docker-compose -f build/docker/docker-compose.yml --profile pg2pg up
+```
+
+You can also run multiple profiles. For example, to start two PostgreSQL instances and Kafka:
+
+```
+docker-compose -f build/docker/docker-compose.yml --profile pg2pg --profile kafka up
+```
+
+List of supported docker profiles:
+
+- pg2pg
+- pg2os
+- pg2webhook
+- kafka
+
 #### Prepare the database

 This will create the `pgstream` schema in the configured Postgres database, along with the tables/functions/triggers required to keep track of the schema changes. See [Tracking schema changes](#tracking-schema-changes) section for more details. It will also create a replication slot for the configured database which will be used by the pgstream service.
@@ -118,6 +140,18 @@ pgstream run -c pg2kafka.env --log-level trace
 pgstream run -c kafka2os.env --log-level trace
 ```

+Example running pgstream with PostgreSQL -> PostgreSQL with initial snapshot enabled:
+
+```
+pgstream run -c pg2pg.env --log-level trace
+```
+
+Example running pgstream with PostgreSQL snapshot only mode -> PostgreSQL:
+
+```
+pgstream run -c snapshot2pg.env --log-level trace
+```
+
 The run command will parse the configuration provided, and initialise the configured modules. It requires at least one listener and one processor.

 ## Configuration
@@ -227,6 +261,21 @@ One of exponential/constant backoff policies can be provided for the search stor

 </details>

+<details>
+<summary>Postgres Batch Writer</summary>
+
+| Environment Variable                         | Default | Required | Description                                                                                                        |
+| -------------------------------------------- | ------- | -------- | ------------------------------------------------------------------------------------------------------------------ |
+| PGSTREAM_POSTGRES_WRITER_TARGET_URL          | N/A     | Yes      | URL for the PostgreSQL store to connect to                                                                         |
+| PGSTREAM_POSTGRES_WRITER_BATCH_TIMEOUT       | 1s      | No       | Max time interval at which the batch sending to PostgreSQL is triggered.                                            |
+| PGSTREAM_POSTGRES_WRITER_BATCH_SIZE          | 100     | No       | Max number of messages to be sent per batch. When this size is reached, the batch is sent to PostgreSQL.            |
+| PGSTREAM_POSTGRES_WRITER_MAX_QUEUE_BYTES     | 100MiB  | No       | Max memory used by the postgres batch writer for inflight batches.                                                  |
+| PGSTREAM_POSTGRES_WRITER_BATCH_BYTES         | 1572864 | No       | Max size in bytes for a given batch. When this size is reached, the batch is sent to PostgreSQL.                    |
+| PGSTREAM_POSTGRES_WRITER_SCHEMALOG_STORE_URL | N/A     | No       | URL of the store where the pgstream schemalog table which keeps track of schema changes is.                         |
+| PGSTREAM_POSTGRES_WRITER_DISABLE_TRIGGERS    | False   | No       | Option to disable triggers on the target PostgreSQL database while performing the snapshot/replication streaming.   |
+
+</details>
+
 <details>
 <summary>Injector</summary>

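The batch limits in the table above interact: a batch is flushed to PostgreSQL once it reaches `PGSTREAM_POSTGRES_WRITER_BATCH_SIZE` messages or `PGSTREAM_POSTGRES_WRITER_BATCH_BYTES` bytes, `PGSTREAM_POSTGRES_WRITER_BATCH_TIMEOUT` bounds how long a partially filled batch may wait, and `PGSTREAM_POSTGRES_WRITER_MAX_QUEUE_BYTES` caps the memory held by batches in flight. A minimal Go sketch of the size/bytes/timeout flush logic (purely illustrative — the type and names below are made up, this is not pgstream's implementation):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// toyBatcher illustrates the flush rules behind the batch writer settings:
// flush when the message count or byte size threshold is hit, or when the
// timeout elapses for a partially filled batch. It is a sketch, not pgstream code.
type toyBatcher struct {
	mu          sync.Mutex
	maxMessages int           // PGSTREAM_POSTGRES_WRITER_BATCH_SIZE
	maxBytes    int           // PGSTREAM_POSTGRES_WRITER_BATCH_BYTES
	timeout     time.Duration // PGSTREAM_POSTGRES_WRITER_BATCH_TIMEOUT

	msgs  [][]byte
	bytes int
	timer *time.Timer
}

func (b *toyBatcher) add(msg []byte) {
	b.mu.Lock()
	defer b.mu.Unlock()
	if b.timer == nil {
		// the first message of a batch starts the timeout clock
		b.timer = time.AfterFunc(b.timeout, func() {
			b.mu.Lock()
			defer b.mu.Unlock()
			b.flushLocked("timeout")
		})
	}
	b.msgs = append(b.msgs, msg)
	b.bytes += len(msg)
	if len(b.msgs) >= b.maxMessages || b.bytes >= b.maxBytes {
		b.flushLocked("size")
	}
}

// flushLocked stands in for the single batched write to PostgreSQL.
func (b *toyBatcher) flushLocked(reason string) {
	if len(b.msgs) == 0 {
		return
	}
	fmt.Printf("flushing %d messages (%d bytes) due to %s\n", len(b.msgs), b.bytes, reason)
	b.msgs, b.bytes = nil, 0
	if b.timer != nil {
		b.timer.Stop()
		b.timer = nil
	}
}

func main() {
	b := &toyBatcher{maxMessages: 100, maxBytes: 1572864, timeout: time.Second}
	for i := 0; i < 250; i++ {
		b.add([]byte("wal event payload")) // flushes twice on message count
	}
	time.Sleep(2 * time.Second) // the remaining 50 messages flush on timeout
}
```
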
@@ -253,6 +302,14 @@ The detailed SQL used can be found in the [migrations folder](https://github.com

 The schema and data changes are part of the same linear stream - the downstream consumers always observe the schema changes as soon as they happen, before any data arrives that relies on the new schema. This prevents data loss and manual intervention.

+## Snapshots
+
+`pgstream` can handle the generation of PostgreSQL snapshots, including both schema and data. The current implementations for each are:
+
+- Schema: depending on the configuration, it can use either the pgstream `schema_log` table to get the schema view and process it as events downstream, or rely on the `pg_dump`/`pg_restore` PostgreSQL utilities.
+
+- Data: it relies on transaction snapshot ids to obtain a stable view of the database, and parallelises the read of all the rows by dividing them into ranges using the `ctid`.
+
 ## Architecture

 `pgstream` is constructed as a streaming pipeline, where data from one module streams into the next, eventually reaching the configured output plugins. It keeps track of schema changes and replicates them along with the data changes to ensure a consistent view of the source data downstream. This modular approach makes adding and integrating output plugin implementations simple and painless.
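
The data snapshot approach described in the new Snapshots section can be pictured with plain PostgreSQL primitives: one session exports a transaction snapshot, and parallel workers import it so they all read the same stable view of the table, each scanning a disjoint `ctid` page range. A minimal sketch using the pgx driver (illustrative only — the connection string, table name and page ranges are assumptions, not pgstream's code; `ctid` range conditions of this form rely on TID range scans available since PostgreSQL 14):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"sync"

	"github.com/jackc/pgx/v5"
)

const url = "postgres://postgres:postgres@localhost?sslmode=disable" // assumed connection string

func main() {
	ctx := context.Background()

	// Coordinator connection: export a snapshot and keep the transaction open
	// so that workers can import it and see the exact same view of the data.
	coord, err := pgx.Connect(ctx, url)
	if err != nil {
		log.Fatal(err)
	}
	defer coord.Close(ctx)

	coordTx, err := coord.BeginTx(ctx, pgx.TxOptions{IsoLevel: pgx.RepeatableRead})
	if err != nil {
		log.Fatal(err)
	}
	defer coordTx.Rollback(ctx)

	var snapshotID string
	if err := coordTx.QueryRow(ctx, "SELECT pg_export_snapshot()").Scan(&snapshotID); err != nil {
		log.Fatal(err)
	}

	// Each worker reads a disjoint range of pages via ctid, all under the same snapshot.
	ranges := [][2]int{{0, 1000}, {1000, 2000}, {2000, 3000}} // page ranges are assumptions
	var wg sync.WaitGroup
	for _, r := range ranges {
		wg.Add(1)
		go func(from, to int) {
			defer wg.Done()
			conn, err := pgx.Connect(ctx, url)
			if err != nil {
				log.Print(err)
				return
			}
			defer conn.Close(ctx)

			tx, err := conn.BeginTx(ctx, pgx.TxOptions{IsoLevel: pgx.RepeatableRead})
			if err != nil {
				log.Print(err)
				return
			}
			defer tx.Rollback(ctx)

			// snapshotID comes from the server; SET TRANSACTION SNAPSHOT cannot be parameterised.
			if _, err := tx.Exec(ctx, fmt.Sprintf("SET TRANSACTION SNAPSHOT '%s'", snapshotID)); err != nil {
				log.Print(err)
				return
			}

			rows, err := tx.Query(ctx,
				"SELECT id, name FROM my_table WHERE ctid >= $1::tid AND ctid < $2::tid",
				fmt.Sprintf("(%d,0)", from), fmt.Sprintf("(%d,0)", to))
			if err != nil {
				log.Print(err)
				return
			}
			defer rows.Close()
			for rows.Next() {
				var id int
				var name string
				if err := rows.Scan(&id, &name); err != nil {
					log.Print(err)
					return
				}
				// hand the row off to the downstream processor here
				_, _ = id, name
			}
			if err := rows.Err(); err != nil {
				log.Print(err)
			}
		}(r[0], r[1])
	}
	wg.Wait()
}
```
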
@@ -267,7 +324,9 @@ A listener is anything that listens for WAL data, regardless of the source. It h

 There are currently two implementations of the listener:

-- **Postgres listener**: listens to WAL events directly from the replication slot. Since the WAL replication slot is sequential, the Postgres WAL listener is limited to run as a single process. The associated Postgres checkpointer will sync the LSN so that the replication lag doesn't grow indefinitely.
+- **Postgres listener**: listens to WAL events directly from the replication slot. Since the WAL replication slot is sequential, the Postgres WAL listener is limited to run as a single process. The associated Postgres checkpointer will sync the LSN so that the replication lag doesn't grow indefinitely. It can be configured to perform an initial snapshot when pgstream is first connected to the source PostgreSQL database (see details in the [snapshots section](#snapshots)).
+
+- **Postgres Snapshoter**: produces events by performing a snapshot of the configured PostgreSQL database, as described in the [snapshots section](#snapshots). It doesn't start continuous replication, so once all the snapshotted data has been processed, the pgstream process will stop.

 - **Kafka reader**: reads WAL events from a Kafka topic. It can be configured to run concurrently by using partitions and Kafka consumer groups, applying a fan-out strategy to the WAL events. The data will be partitioned by database schema by default, but can be configured when using `pgstream` as a library. The associated Kafka checkpointer will commit the message offsets per topic/partition so that the consumer group doesn't process the same message twice.

@@ -283,6 +342,8 @@ There are currently two implementations of the processor:

 - **Webhook notifier**: it sends a notification to any webhooks that have subscribed to the relevant wal event. It relies on a subscription HTTP server receiving the subscription requests and storing them in the shared subscription store which is accessed whenever a wal event is processed. It sends the notifications to the different subscribed webhook urls in parallel based on a configurable number of workers (client timeouts apply). Similar to the two previous processor implementations, it uses a memory guarded buffering system internally, which allows to separate the wal event processing from the webhook url sending, optimising the processor latency.

+- **Postgres batch writer**: it writes the WAL events into a PostgreSQL compatible database. It implements the same kind of mechanism as the Kafka and the search batch writers to ensure continuous processing from the listener, and it also uses a batching mechanism to minimise PostgreSQL IO traffic.
+
 In addition to the implementations described above, there are optional processor decorators, which work in conjunction with one of the main processor implementations described above. Their goal is to act as modifiers to enrich the wal event being processed.

 There are currently two implementations of the processor that act as decorators:

cmd/config.go

+20-10
@@ -69,6 +69,7 @@ func parseListenerConfig() stream.ListenerConfig {
 	return stream.ListenerConfig{
 		Postgres: parsePostgresListenerConfig(),
 		Kafka:    parseKafkaListenerConfig(),
+		Snapshot: parseSnapshotListenerConfig(),
 	}
 }

@@ -87,24 +88,32 @@ func parsePostgresListenerConfig() *stream.PostgresListenerConfig {

 	initialSnapshotEnabled := viper.GetBool("PGSTREAM_POSTGRES_LISTENER_INITIAL_SNAPSHOT_ENABLED")
 	if initialSnapshotEnabled {
-		cfg.Snapshot = parseSnapshotListenerConfig(pgURL)
+		cfg.Snapshot = parseSnapshotConfig(pgURL, "PGSTREAM_POSTGRES_INITIAL")
 	}

 	return cfg
 }

-func parseSnapshotListenerConfig(pgURL string) *snapshotbuilder.SnapshotListenerConfig {
+func parseSnapshotListenerConfig() *snapshotbuilder.SnapshotListenerConfig {
+	pgsnapshotURL := viper.GetString("PGSTREAM_POSTGRES_SNAPSHOT_LISTENER_URL")
+	if pgsnapshotURL == "" {
+		return nil
+	}
+	return parseSnapshotConfig(pgsnapshotURL, "PGSTREAM_POSTGRES")
+}
+
+func parseSnapshotConfig(pgURL, prefix string) *snapshotbuilder.SnapshotListenerConfig {
 	return &snapshotbuilder.SnapshotListenerConfig{
-		SnapshotStoreURL: pgURL,
+		SnapshotStoreURL: viper.GetString(fmt.Sprintf("%s_SNAPSHOT_STORE_URL", prefix)),
 		Generator: pgsnapshotgenerator.Config{
 			URL:           pgURL,
-			BatchPageSize: viper.GetUint("PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_BATCH_PAGE_SIZE"),
-			SchemaWorkers: viper.GetUint("PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_SCHEMA_WORKERS"),
-			TableWorkers:  viper.GetUint("PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_TABLE_WORKERS"),
+			BatchPageSize: viper.GetUint(fmt.Sprintf("%s_SNAPSHOT_BATCH_PAGE_SIZE", prefix)),
+			SchemaWorkers: viper.GetUint(fmt.Sprintf("%s_SNAPSHOT_SCHEMA_WORKERS", prefix)),
+			TableWorkers:  viper.GetUint(fmt.Sprintf("%s_SNAPSHOT_TABLE_WORKERS", prefix)),
 		},
 		Adapter: adapter.SnapshotConfig{
-			Tables:          viper.GetStringSlice("PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_TABLES"),
-			SnapshotWorkers: viper.GetUint("PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_WORKERS"),
+			Tables:          viper.GetStringSlice(fmt.Sprintf("%s_SNAPSHOT_TABLES", prefix)),
+			SnapshotWorkers: viper.GetUint(fmt.Sprintf("%s_SNAPSHOT_WORKERS", prefix)),
 		},
 		Schema: parseSchemaSnapshotConfig(pgURL),
 	}
@@ -273,9 +282,10 @@ func parsePostgresProcessorConfig() *stream.PostgresProcessorConfig {
 				MaxBatchSize:  viper.GetInt64("PGSTREAM_POSTGRES_WRITER_BATCH_SIZE"),
 				MaxQueueBytes: viper.GetInt64("PGSTREAM_POSTGRES_WRITER_MAX_QUEUE_BYTES"),
 			},
-			SchemaStore: pgschemalog.Config{
-				URL: viper.GetString("PGSTREAM_POSTGRES_WRITER_SCHEMA_STORE_URL"),
+			SchemaLogStore: pgschemalog.Config{
+				URL: viper.GetString("PGSTREAM_POSTGRES_WRITER_SCHEMALOG_STORE_URL"),
 			},
+			DisableTriggers: viper.GetBool("PGSTREAM_POSTGRES_WRITER_DISABLE_TRIGGERS"),
 		},
 	}
 }
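
With the refactor above, both snapshot code paths share `parseSnapshotConfig(pgURL, prefix)`, so the environment variables differ only by prefix: the initial-snapshot path keeps the `PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_*` names, while the new standalone snapshot listener is enabled by `PGSTREAM_POSTGRES_SNAPSHOT_LISTENER_URL` and reads `PGSTREAM_POSTGRES_SNAPSHOT_*`. A tiny illustrative sketch of the resulting names (not part of the codebase):

```go
package main

import "fmt"

func main() {
	// Mirrors the fmt.Sprintf calls in parseSnapshotConfig: the same suffixes
	// are combined with a different prefix per code path.
	suffixes := []string{
		"_SNAPSHOT_STORE_URL",
		"_SNAPSHOT_BATCH_PAGE_SIZE",
		"_SNAPSHOT_SCHEMA_WORKERS",
		"_SNAPSHOT_TABLE_WORKERS",
		"_SNAPSHOT_TABLES",
		"_SNAPSHOT_WORKERS",
	}
	for _, prefix := range []string{"PGSTREAM_POSTGRES_INITIAL", "PGSTREAM_POSTGRES"} {
		fmt.Printf("%s (prefix)\n", prefix)
		for _, s := range suffixes {
			fmt.Println("  " + prefix + s)
		}
	}
}
```
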

pg2pg.env

+1
@@ -1,6 +1,7 @@
 # Listener config
 PGSTREAM_POSTGRES_LISTENER_URL="postgres://postgres:postgres@localhost?sslmode=disable"
 PGSTREAM_POSTGRES_LISTENER_INITIAL_SNAPSHOT_ENABLED=true
+PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_STORE_URL="postgres://postgres:postgres@localhost?sslmode=disable"
 PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_TABLES="*"
 PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_SCHEMA_WORKERS=4
 PGSTREAM_POSTGRES_INITIAL_SNAPSHOT_TABLE_WORKERS=4

pkg/stream/config.go

+2-1
@@ -28,6 +28,7 @@ type Config struct {
 type ListenerConfig struct {
 	Postgres *PostgresListenerConfig
 	Kafka    *KafkaListenerConfig
+	Snapshot *snapshotbuilder.SnapshotListenerConfig
 }

 type PostgresListenerConfig struct {
@@ -76,7 +77,7 @@ type WebhookSubscriptionStoreConfig struct {
 }

 func (c *Config) IsValid() error {
-	if c.Listener.Kafka == nil && c.Listener.Postgres == nil {
+	if c.Listener.Kafka == nil && c.Listener.Postgres == nil && c.Listener.Snapshot == nil {
 		return errors.New("need at least one listener configured")
 	}

pkg/stream/integration/helper_test.go

+20-1
@@ -121,6 +121,25 @@ func testPostgresListenerCfg() stream.ListenerConfig {
 	}
 }

+func testSnapshotListenerCfg(sourceURL, targetURL string, tables []string) stream.ListenerConfig {
+	return stream.ListenerConfig{
+		Snapshot: &snapshotbuilder.SnapshotListenerConfig{
+			Generator: pgsnapshotgenerator.Config{
+				URL: sourceURL,
+			},
+			Adapter: adapter.SnapshotConfig{
+				Tables: tables,
+			},
+			Schema: snapshotbuilder.SchemaSnapshotConfig{
+				DumpRestore: &pgdumprestore.Config{
+					SourcePGURL: sourceURL,
+					TargetPGURL: targetURL,
+				},
+			},
+		},
+	}
+}
+
 func testPostgresListenerCfgWithSnapshot(sourceURL, targetURL string, tables []string) stream.ListenerConfig {
 	return stream.ListenerConfig{
 		Postgres: &stream.PostgresListenerConfig{
@@ -209,7 +228,7 @@ func testPostgresProcessorCfg(sourcePGURL string) stream.ProcessorConfig {
 			BatchConfig: batch.Config{
 				BatchTimeout: 50 * time.Millisecond,
 			},
-			SchemaStore: schemalogpg.Config{
+			SchemaLogStore: schemalogpg.Config{
 				URL: sourcePGURL,
 			},
 		},
New file

+92

@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: Apache-2.0
+
+package integration
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	pglib "github.com/xataio/pgstream/internal/postgres"
+	"github.com/xataio/pgstream/internal/testcontainers"
+	"github.com/xataio/pgstream/pkg/stream"
+)
+
+func Test_SnapshotToPostgres(t *testing.T) {
+	if os.Getenv("PGSTREAM_INTEGRATION_TESTS") == "" {
+		t.Skip("skipping integration test...")
+	}
+
+	// postgres container where pgstream hasn't been initialised to be used for
+	// snapshot validation
+	var snapshotPGURL string
+	pgcleanup, err := testcontainers.SetupPostgresContainer(context.Background(), &snapshotPGURL, testcontainers.Postgres14, "config/postgresql.conf")
+	require.NoError(t, err)
+	defer pgcleanup()
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	testTable := "snapshot2pg_integration_test"
+	// create table and populate it before initialising and running pgstream to
+	// ensure the snapshot captures pre-existing schema and data properly
+	execQueryWithURL(t, ctx, snapshotPGURL, fmt.Sprintf("create table %s(id serial primary key, name text)", testTable))
+	execQueryWithURL(t, ctx, snapshotPGURL, fmt.Sprintf("insert into %s(name) values('a'),('b')", testTable))
+
+	cfg := &stream.Config{
+		Listener:  testSnapshotListenerCfg(snapshotPGURL, targetPGURL, []string{testTable}),
+		Processor: testPostgresProcessorCfg(snapshotPGURL),
+	}
+	initStream(t, ctx, snapshotPGURL)
+	runStream(t, ctx, cfg)
+
+	targetConn, err := pglib.NewConn(ctx, targetPGURL)
+	require.NoError(t, err)
+
+	timer := time.NewTimer(20 * time.Second)
+	defer timer.Stop()
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+
+	validation := func() bool {
+		schemaColumns := getInformationSchemaColumns(t, ctx, targetConn, testTable)
+		if len(schemaColumns) != 2 {
+			return false
+		}
+
+		wantSchemaCols := []*informationSchemaColumn{
+			{name: "id", dataType: "integer", isNullable: "NO"},
+			{name: "name", dataType: "text", isNullable: "YES"},
+		}
+		require.ElementsMatch(t, wantSchemaCols, schemaColumns)
+
+		columns := getTestTableColumns(t, ctx, targetConn, fmt.Sprintf("select id,name from %s", testTable))
+		if len(columns) != 2 {
+			return false
+		}
+
+		wantCols := []*testTableColumn{
+			{id: 1, name: "a"},
+			{id: 2, name: "b"},
+		}
+		require.ElementsMatch(t, wantCols, columns)
+
+		return true
+	}
+
+	for {
+		select {
+		case <-timer.C:
+			cancel()
+			t.Error("timeout waiting for postgres snapshot sync")
+			return
+		case <-ticker.C:
+			if validation() {
+				return
+			}
+		}
+	}
+}

pkg/stream/stream_run.go

+20-1
@@ -17,6 +17,7 @@ import (
 	"github.com/xataio/pgstream/pkg/wal/listener"
 	kafkalistener "github.com/xataio/pgstream/pkg/wal/listener/kafka"
 	pglistener "github.com/xataio/pgstream/pkg/wal/listener/postgres"
+	snapshotlistener "github.com/xataio/pgstream/pkg/wal/listener/snapshot"
 	snapshotbuilder "github.com/xataio/pgstream/pkg/wal/listener/snapshot/builder"
 	"github.com/xataio/pgstream/pkg/wal/processor"
 	"github.com/xataio/pgstream/pkg/wal/processor/injector"
@@ -283,7 +284,6 @@ func Run(ctx context.Context, logger loglib.Logger, config *Config, instrumentat
 			return listener.Listen(ctx)
 		})
 	case config.Listener.Kafka != nil:
-		var err error
 		listener, err := kafkalistener.NewWALReader(
 			kafkaReader,
 			processor.ProcessWALEvent,
@@ -298,6 +298,25 @@
 			logger.Info("running kafka reader...")
 			return listener.Listen(ctx)
 		})
+
+	case config.Listener.Snapshot != nil:
+		var err error
+		snapshotGenerator, err := snapshotbuilder.NewSnapshotGenerator(
+			ctx,
+			config.Listener.Snapshot,
+			processor.ProcessWALEvent,
+			logger)
+		if err != nil {
+			return err
+		}
+		listener := snapshotlistener.New(snapshotGenerator)
+		defer listener.Close()
+
+		eg.Go(func() error {
+			defer logger.Info("stopping postgres snapshot listener...")
+			logger.Info("running postgres snapshot listener...")
+			return listener.Listen(ctx)
+		})
 	}

 	if err := eg.Wait(); err != nil {