Skip to content

Commit

Permalink
Merge pull request #7263 from neondatabase/rc/2024-03-27
Browse files Browse the repository at this point in the history
Release 2024-03-27 - compute only release
  • Loading branch information
skyzh authored Mar 27, 2024
2 parents 4e5724d + 24c5a5a commit c431e2f
Show file tree
Hide file tree
Showing 45 changed files with 1,140 additions and 706 deletions.
90 changes: 52 additions & 38 deletions .github/workflows/trigger-e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ jobs:

trigger-e2e-tests:
needs: [ tag ]
runs-on: [ self-hosted, gen3, small ]
runs-on: ubuntu-latest
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
Expand All @@ -79,41 +79,55 @@ jobs:
fi
done
- name: Set PR's status to pending and request a remote CI test
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
REMOTE_REPO="${{ github.repository_owner }}/cloud"
- name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"
gh workflow --repo ${REMOTE_REPO} \
run testing.yml \
--ref "main" \
--raw-field "ci_job_name=neon-cloud-e2e" \
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 3 additions & 14 deletions compute_tools/src/spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
// from neon_superuser.
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
Expand Down Expand Up @@ -806,19 +806,8 @@ $$;"#,
"",
"",
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];

let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
Expand Down
31 changes: 25 additions & 6 deletions control_plane/attachment_service/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,8 @@ impl Service {
&self,
create_req: TenantCreateRequest,
) -> Result<TenantCreateResponse, ApiError> {
let tenant_id = create_req.new_tenant_id.tenant_id;

// Exclude any concurrent attempts to create/access the same tenant ID
let _tenant_lock = self
.tenant_op_locks
Expand All @@ -1531,7 +1533,12 @@ impl Service {

let (response, waiters) = self.do_tenant_create(create_req).await?;

self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
// accept compute notifications while it is in the process of creating. Reconciliation will
// be retried in the background.
tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
}
Ok(response)
}

Expand Down Expand Up @@ -1610,13 +1617,25 @@ impl Service {
splitting: SplitState::default(),
})
.collect();
self.persistence

match self
.persistence
.insert_tenant_shards(persist_tenant_shards)
.await
.map_err(|e| {
// TODO: distinguish primary key constraint (idempotent, OK), from other errors
ApiError::InternalServerError(anyhow::anyhow!(e))
})?;
{
Ok(_) => {}
Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
DatabaseErrorKind::UniqueViolation,
_,
))) => {
// Unique key violation: this is probably a retry. Because the shard count is part of the unique key,
// if we see a unique key violation it means that the creation request's shard count matches the previous
// creation's shard count.
tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
}
// Any other database error is unexpected and a bug.
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
};

let (waiters, response_shards) = {
let mut locked = self.inner.write().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion libs/metrics/src/hll.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ macro_rules! register_hll {
}};

($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
}};
}

Expand Down
1 change: 1 addition & 0 deletions pageserver/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
Expand Down
58 changes: 31 additions & 27 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,33 +600,37 @@ fn start_pageserver(
None,
"consumption metrics collection",
true,
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();

tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};

pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
{
let tenant_manager = tenant_manager.clone();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();

tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};

pageserver::consumption_metrics::collect_metrics(
tenant_manager,
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
}
},
);
}
Expand Down
25 changes: 25 additions & 0 deletions pageserver/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ pub mod defaults {

pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

///
/// Default built-in configuration file.
///
Expand Down Expand Up @@ -156,6 +158,8 @@ pub mod defaults {
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
[remote_storage]
"#
Expand Down Expand Up @@ -279,6 +283,13 @@ pub struct PageServerConf {
pub max_vectored_read_bytes: MaxVectoredReadBytes,

pub validate_vectored_get: bool,

/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
}

/// We do not want to store this in a PageServerConf because the latter may be logged
Expand Down Expand Up @@ -400,6 +411,8 @@ struct PageServerConfigBuilder {
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

validate_vectored_get: BuilderValue<bool>,

ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
}

impl PageServerConfigBuilder {
Expand Down Expand Up @@ -486,6 +499,7 @@ impl PageServerConfigBuilder {
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
}
}
}
Expand Down Expand Up @@ -665,6 +679,10 @@ impl PageServerConfigBuilder {
self.validate_vectored_get = BuilderValue::Set(value);
}

pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}

pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();

Expand Down Expand Up @@ -720,6 +738,7 @@ impl PageServerConfigBuilder {
get_vectored_impl,
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
}
CUSTOM LOGIC
{
Expand Down Expand Up @@ -1010,6 +1029,9 @@ impl PageServerConf {
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
Expand Down Expand Up @@ -1091,6 +1113,7 @@ impl PageServerConf {
.expect("Invalid default constant"),
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
}
}
}
Expand Down Expand Up @@ -1328,6 +1351,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Correct defaults should be used when no config values are provided"
);
Expand Down Expand Up @@ -1399,6 +1423,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Should be able to parse all basic config values correctly"
);
Expand Down
Loading

1 comment on commit c431e2f

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2730 tests run: 2592 passed, 0 failed, 138 skipped (full report)


Code coverage* (full report)

  • functions: 28.2% (6308 of 22367 functions)
  • lines: 47.0% (44296 of 94303 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
c431e2f at 2024-03-27T19:37:01.740Z :recycle:

Please sign in to comment.