
Commit fa3a298

Authored by rodrigo-o and Copilot
feat(l1): add rpc error rates to metrics and panels (#5335)
**Motivation**

Add success/error rate panels for the Engine and RPC APIs.

**Description**

This PR:
- Extracts the RPC metric logic into its own module, to avoid overloading the previous profiling module that only covered block processing.
- Adds the new error/success rate metrics and instrumentation alongside the existing RPC instrumentation.
- Moves the shared logic for gathering default metrics out of the profiling module.
- Adds a whole new dashboard row tracking both RPC and Engine error rates, with deaggregation by method and kind of error.
- Adds an Engine pie chart showing the proportion of calls by method.
- Updates the dashboard docs; see the changes [here](https://github.com/lambdaclass/ethrex/blob/rpc_error_rates/docs/developers/l1/dashboards.md#engine-api) and [here](https://github.com/lambdaclass/ethrex/blob/rpc_error_rates/docs/developers/l1/dashboards.md#engine-and-rpc-error-rates).

<img width="2543" height="1145" alt="image" src="https://github.com/user-attachments/assets/19eb2383-7dd3-41b1-ad8b-d1580a98ebb6" />

**Next Steps**

Some improvements to the block processing profiling remain:
- [ ] Follow-up work beyond this refactor, tracked in #5327.
- [x] A new issue was created to rename the remaining metrics modules and remove the extra `metrics_` prefix: #5378.
- [x] We may also want to capture more error information so errors can be deaggregated further, tracked in #5379.

**NOTE**: Once this is merged and published to our shared Grafana, the servers will need to be updated to main to see the RPC/Engine latency panels, since the metric names changed in this PR due to the extraction from the previous block profiling module.

Closes #5379

---------

Co-authored-by: Copilot <[email protected]>
1 parent f505c00 commit fa3a298

17 files changed: +1046 −385 lines

cmd/ethrex/initializers.rs

Lines changed: 2 additions & 0 deletions
```diff
@@ -11,6 +11,7 @@ use ethrex_common::types::Genesis;
 use ethrex_config::networks::Network;
 
 use ethrex_metrics::profiling::{FunctionProfilingLayer, initialize_block_processing_profile};
+use ethrex_metrics::rpc::initialize_rpc_metrics;
 use ethrex_p2p::rlpx::initiator::RLPxInitiator;
 use ethrex_p2p::{
     discv4::peer_table::PeerTable,
@@ -89,6 +90,7 @@ pub fn init_metrics(opts: &Options, tracker: TaskTracker) {
     );
 
     initialize_block_processing_profile();
+    initialize_rpc_metrics();
 
     tracker.spawn(metrics_api);
 }
```

crates/blockchain/metrics/api.rs

Lines changed: 4 additions & 5 deletions
```diff
@@ -1,9 +1,8 @@
 use axum::{Router, routing::get};
 
-use crate::profiling::gather_profiling_metrics;
-
 use crate::{
-    MetricsApiError, blocks::METRICS_BLOCKS, process::METRICS_PROCESS, transactions::METRICS_TX,
+    MetricsApiError, blocks::METRICS_BLOCKS, gather_default_metrics, process::METRICS_PROCESS,
+    transactions::METRICS_TX,
 };
 
 pub async fn start_prometheus_metrics_api(
@@ -32,10 +31,10 @@ pub(crate) async fn get_metrics() -> String {
     };
 
     ret_string.push('\n');
-    match gather_profiling_metrics() {
+    match gather_default_metrics() {
         Ok(string) => ret_string.push_str(&string),
         Err(_) => {
-            tracing::error!("Failed to register METRICS_PROFILING");
+            tracing::error!("Failed to gather default Prometheus metrics");
             return String::new();
         }
     };
```

crates/blockchain/metrics/mod.rs

Lines changed: 23 additions & 0 deletions
```diff
@@ -8,6 +8,8 @@ pub mod l2;
 pub mod process;
 #[cfg(feature = "api")]
 pub mod profiling;
+#[cfg(feature = "api")]
+pub mod rpc;
 #[cfg(any(feature = "api", feature = "transactions"))]
 pub mod transactions;
 
@@ -70,3 +72,24 @@ pub enum MetricsError {
     #[error("MetricsL2Error {0}")]
     FromUtf8Error(#[from] std::string::FromUtf8Error),
 }
+
+#[cfg(feature = "api")]
+/// Returns all metrics currently registered in Prometheus' default registry.
+///
+/// Both profiling and RPC metrics register with this default registry, and the
+/// metrics API surfaces them by calling this helper.
+pub fn gather_default_metrics() -> Result<String, MetricsError> {
+    use prometheus::{Encoder, TextEncoder};
+
+    let encoder = TextEncoder::new();
+    let metric_families = prometheus::gather();
+
+    let mut buffer = Vec::new();
+    encoder
+        .encode(&metric_families, &mut buffer)
+        .map_err(|e| MetricsError::PrometheusErr(e.to_string()))?;
+
+    let res = String::from_utf8(buffer)?;
+
+    Ok(res)
+}
```
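As a side note on how `gather_default_metrics` fits in: it text-encodes whatever lives in the `prometheus` crate's global default registry, which is where the `register_*` macros used by the profiling and RPC modules put their metrics. A minimal standalone sketch of that flow, using a hypothetical `example_requests_total` counter rather than any real ethrex metric:

```rust
use prometheus::{Encoder, TextEncoder, register_int_counter};

fn main() {
    // `register_int_counter!` adds the counter to the global default registry,
    // the same registry that `gather_default_metrics()` encodes.
    let counter = register_int_counter!(
        "example_requests_total",
        "Hypothetical counter, used only for this sketch"
    )
    .unwrap();
    counter.inc();

    // Equivalent to the body of `gather_default_metrics`: gather every metric
    // family from the default registry and text-encode it.
    let families = prometheus::gather();
    let mut buffer = Vec::new();
    TextEncoder::new().encode(&families, &mut buffer).unwrap();
    println!("{}", String::from_utf8(buffer).unwrap());
}
```

Running this prints the counter in the Prometheus text exposition format, which is what the metrics API appends to its response through `gather_default_metrics`.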

crates/blockchain/metrics/profiling.rs

Lines changed: 5 additions & 43 deletions
```diff
@@ -1,17 +1,18 @@
-use prometheus::{Encoder, HistogramTimer, HistogramVec, TextEncoder, register_histogram_vec};
-use std::{future::Future, sync::LazyLock};
+use prometheus::{HistogramTimer, HistogramVec, register_histogram_vec};
+use std::sync::LazyLock;
 use tracing::{
     Subscriber,
     field::{Field, Visit},
     span::{Attributes, Id},
 };
 use tracing_subscriber::{Layer, layer::Context, registry::LookupSpan};
 
-use crate::MetricsError;
-
 pub static METRICS_BLOCK_PROCESSING_PROFILE: LazyLock<HistogramVec> =
     LazyLock::new(initialize_histogram_vec);
 
+// Metrics defined in this module register into the Prometheus default registry.
+// The metrics API exposes them by calling `gather_default_metrics()`.
+
 fn initialize_histogram_vec() -> HistogramVec {
     register_histogram_vec!(
         "function_duration_seconds",
@@ -111,45 +112,6 @@ where
     }
 }
 
-/// Records the duration of an async operation in the function profiling histogram.
-///
-/// This provides a lightweight alternative to the `#[instrument]` attribute when you need
-/// manual control over timing instrumentation, such as in RPC handlers.
-///
-/// # Parameters
-/// * `namespace` - Category for the metric (e.g., "rpc", "engine", "block_execution")
-/// * `function_name` - Name identifier for the operation being timed
-/// * `future` - The async operation to time
-///
-/// Use this function when you need to instrument an async operation for duration metrics,
-/// but cannot or do not want to use the `#[instrument]` attribute (for example, in RPC handlers).
-pub async fn record_async_duration<Fut, T>(namespace: &str, function_name: &str, future: Fut) -> T
-where
-    Fut: Future<Output = T>,
-{
-    let timer = METRICS_BLOCK_PROCESSING_PROFILE
-        .with_label_values(&[namespace, function_name])
-        .start_timer();
-
-    let output = future.await;
-    timer.observe_duration();
-    output
-}
-
-pub fn gather_profiling_metrics() -> Result<String, MetricsError> {
-    let encoder = TextEncoder::new();
-    let metric_families = prometheus::gather();
-
-    let mut buffer = Vec::new();
-    encoder
-        .encode(&metric_families, &mut buffer)
-        .map_err(|e| MetricsError::PrometheusErr(e.to_string()))?;
-
-    let res = String::from_utf8(buffer)?;
-
-    Ok(res)
-}
-
 pub fn initialize_block_processing_profile() {
     METRICS_BLOCK_PROCESSING_PROFILE.reset();
 }
```

crates/blockchain/metrics/rpc.rs

Lines changed: 85 additions & 0 deletions
```diff
@@ -0,0 +1,85 @@
+use prometheus::{CounterVec, HistogramVec, register_counter_vec, register_histogram_vec};
+use std::{future::Future, sync::LazyLock};
+
+pub static METRICS_RPC_REQUEST_OUTCOMES: LazyLock<CounterVec> =
+    LazyLock::new(initialize_rpc_outcomes_counter);
+
+pub static METRICS_RPC_DURATION: LazyLock<HistogramVec> =
+    LazyLock::new(initialize_rpc_duration_histogram);
+
+// Metrics defined in this module register into the Prometheus default registry.
+// The metrics API exposes them by calling `gather_default_metrics()`.
+
+fn initialize_rpc_outcomes_counter() -> CounterVec {
+    register_counter_vec!(
+        "rpc_requests_total",
+        "Total number of RPC requests partitioned by namespace, method, and outcome",
+        &["namespace", "method", "outcome", "error_kind"],
+    )
+    .unwrap()
+}
+
+fn initialize_rpc_duration_histogram() -> HistogramVec {
+    register_histogram_vec!(
+        "rpc_request_duration_seconds",
+        "Histogram of RPC request handling duration partitioned by namespace and method",
+        &["namespace", "method"],
+    )
+    .unwrap()
+}
+
+/// Represents the outcome of an RPC request when recording metrics.
+#[derive(Clone)]
+pub enum RpcOutcome {
+    Success,
+    Error(&'static str),
+}
+
+impl RpcOutcome {
+    fn as_label(&self) -> &'static str {
+        match self {
+            RpcOutcome::Success => "success",
+            RpcOutcome::Error(_) => "error",
+        }
+    }
+
+    fn error_kind(&self) -> &str {
+        match self {
+            RpcOutcome::Success => "",
+            RpcOutcome::Error(kind) => kind,
+        }
+    }
+}
+
+pub fn record_rpc_outcome(namespace: &str, method: &str, outcome: RpcOutcome) {
+    METRICS_RPC_REQUEST_OUTCOMES
+        .with_label_values(&[namespace, method, outcome.as_label(), outcome.error_kind()])
+        .inc();
+}
+
+pub fn initialize_rpc_metrics() {
+    METRICS_RPC_REQUEST_OUTCOMES.reset();
+    METRICS_RPC_DURATION.reset();
+}
+
+/// Records the duration of an async operation in the RPC request duration histogram.
+///
+/// This provides a lightweight alternative to the `#[instrument]` attribute.
+///
+/// # Parameters
+/// * `namespace` - Category for the metric (e.g., "rpc", "engine", "block_execution")
+/// * `method` - Name identifier for the operation being timed
+/// * `future` - The async operation to time
+pub async fn record_async_duration<Fut, T>(namespace: &str, method: &str, future: Fut) -> T
+where
+    Fut: Future<Output = T>,
+{
+    let timer = METRICS_RPC_DURATION
+        .with_label_values(&[namespace, method])
+        .start_timer();
+
+    let output = future.await;
+    timer.observe_duration();
+    output
+}
```
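For a condensed picture of how the two helpers in this module compose (the real call site is in the `crates/networking/rpc/rpc.rs` diff below), here is a hedged sketch; the `eth_blockNumber` handler and the `"Internal"` error-kind string are illustrative stand-ins, not ethrex code:

```rust
use ethrex_metrics::rpc::{RpcOutcome, record_async_duration, record_rpc_outcome};

// Hypothetical handler standing in for a real RPC method implementation.
async fn handle_block_number() -> Result<u64, String> {
    Ok(42)
}

async fn instrumented_block_number() -> Result<u64, String> {
    // Time the handler under the ("rpc", "eth_blockNumber") label pair...
    let result = record_async_duration("rpc", "eth_blockNumber", handle_block_number()).await;

    // ...then bump rpc_requests_total with the matching outcome labels.
    let outcome = match &result {
        Ok(_) => RpcOutcome::Success,
        // In ethrex the kind comes from `get_error_kind(&RpcErr)`; a plain
        // static string is used here only to keep the sketch self-contained.
        Err(_) => RpcOutcome::Error("Internal"),
    };
    record_rpc_outcome("rpc", "eth_blockNumber", outcome);

    result
}
```

This pattern keeps the duration histogram labelled only by namespace and method, while the counter carries the `outcome` and `error_kind` labels that the error-rate panels query.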

crates/networking/rpc/rpc.rs

Lines changed: 37 additions & 5 deletions
```diff
@@ -55,7 +55,7 @@ use bytes::Bytes;
 use ethrex_blockchain::Blockchain;
 use ethrex_blockchain::error::ChainError;
 use ethrex_common::types::Block;
-use ethrex_metrics::profiling::record_async_duration;
+use ethrex_metrics::rpc::{RpcOutcome, record_async_duration, record_rpc_outcome};
 use ethrex_p2p::peer_handler::PeerHandler;
 use ethrex_p2p::sync_manager::SyncManager;
 use ethrex_p2p::types::Node;
@@ -196,16 +196,48 @@ pub trait RpcHandler: Sized {
             Ok(RpcNamespace::Engine) => "engine",
             _ => "rpc",
         };
+        let method = req.method.as_str();
+
+        let result =
+            record_async_duration(
+                namespace,
+                method,
+                async move { request.handle(context).await },
+            )
+            .await;
+
+        let outcome = match &result {
+            Ok(_) => RpcOutcome::Success,
+            Err(err) => RpcOutcome::Error(get_error_kind(err)),
+        };
+        record_rpc_outcome(namespace, method, outcome);
 
-        record_async_duration(namespace, req.method.as_str(), async move {
-            request.handle(context).await
-        })
-        .await
+        result
     }
 
     async fn handle(&self, context: RpcApiContext) -> Result<Value, RpcErr>;
 }
 
+fn get_error_kind(err: &RpcErr) -> &'static str {
+    match err {
+        RpcErr::MethodNotFound(_) => "MethodNotFound",
+        RpcErr::WrongParam(_) => "WrongParam",
+        RpcErr::BadParams(_) => "BadParams",
+        RpcErr::MissingParam(_) => "MissingParam",
+        RpcErr::TooLargeRequest => "TooLargeRequest",
+        RpcErr::BadHexFormat(_) => "BadHexFormat",
+        RpcErr::UnsuportedFork(_) => "UnsuportedFork",
+        RpcErr::Internal(_) => "Internal",
+        RpcErr::Vm(_) => "Vm",
+        RpcErr::Revert { .. } => "Revert",
+        RpcErr::Halt { .. } => "Halt",
+        RpcErr::AuthenticationError(_) => "AuthenticationError",
+        RpcErr::InvalidForkChoiceState(_) => "InvalidForkChoiceState",
+        RpcErr::InvalidPayloadAttributes(_) => "InvalidPayloadAttributes",
+        RpcErr::UnknownPayload(_) => "UnknownPayload",
+    }
+}
+
 pub const FILTER_DURATION: Duration = {
     if cfg!(test) {
         Duration::from_secs(1)
```

docs/developers/l1/dashboards.md

Lines changed: 32 additions & 5 deletions
```diff
@@ -94,16 +94,21 @@ Collapsed row that surfaces the `namespace="engine"` Prometheus timers so you ca
 
 ![Engine API row](img/engine_api_row.png)
 
-### Engine Request Rate by Method
-Shows how many Engine API calls per second we process, split by JSON-RPC method and averaged across the currently selected dashboard range.
+### Engine Total Time per Method
+Pie chart that shows where Engine time is spent across methods over the selected range. Quickly surfaces which endpoints dominate total processing time.
 
-![Engine Request Rate by Method](img/engine_request_rate_by_method.png)
+![Engine Total Time per Method](img/engine_total_time_per_method.png)
 
 ### Engine Latency by Methods (Avg Duration)
 Bar gauge of the historical average latency per Engine method over the selected time range.
 
 ![Engine Latency by Methods](img/engine_latency_by_methods.png)
 
+### Engine Request Rate by Method
+Shows how many Engine API calls per second we process, split by JSON-RPC method and averaged across the currently selected dashboard range.
+
+![Engine Request Rate by Method](img/engine_request_rate_by_method.png)
+
 ### Engine Latency by Method
 Live timeseries that tries to correlate to the per-block execution time by showing real-time latency per Engine method with an 18 s lookback window.
 
@@ -117,10 +122,10 @@ Another collapsed row focused on the public JSON-RPC surface (`namespace="rpc"`)
 
 ![RPC API row](img/rpc_api_row.png)
 
-### RPC Time per Method
+### RPC Total Time per Method
 Pie chart that shows where RPC time is spent across methods over the selected range. Quickly surfaces which endpoints dominate total processing time.
 
-![RPC Time per Method](img/rpc_time_per_method.png)
+![RPC Total Time per Method](img/rpc_total_time_per_method.png)
 
 ### Slowest RPC Methods
 Table listing the highest average-latency methods over the active dashboard range. Used to prioritise optimisation or caching efforts.
@@ -139,6 +144,28 @@ Live timeseries that tries to correlate to the per-block execution time by showi
 
 _**Limitations**: The RPC latency views inherit the same windowing caveats as the Engine charts: averages use the dashboard time range while the live chart relies on an 18 s window._
 
+## Engine and RPC Error rates
+
+Collapsed row showing error rates for both the Engine and RPC APIs side by side, plus a deaggregated panel by method and kind of error. Each panel repeats per instance so behaviour can be compared across nodes.
+
+![Engine and RPC Error rates row](img/engine_and_rpc_error_rates_row.png)
+
+### Engine Success/Error Rate
+Shows the rate of successful vs. failed Engine API requests per second.
+
+![Engine Success/Error Rate](img/engine_success_error_rate.png)
+
+### RPC Success/Error Rate
+Shows the rate of successful vs. failed RPC API requests per second.
+
+![RPC Success/Error Rate](img/rpc_success_error_rate.png)
+
+### Engine and RPC Errors % by Method and Kind
+
+Deaggregated view of error percentages split by method and error kind for both the Engine and RPC APIs. The percentages are calculated against the total requests for a particular method, so the individual error percentages for a method should sum to that method's overall error percentage.
+
+![Engine and RPC Errors % by Method and Kind](img/engine_and_rpc_errors_by_method_and_kind.png)
+
 ## Process and server info
 
 Row panels showing process-level and host-level metrics to help you monitor resource usage and spot potential issues.
```
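To make the arithmetic behind the errors-by-method-and-kind panel explicit (this is an interpretation of the description above, not a quote from the dashboard JSON): each (method, error kind) slice is computed against that method's total request count,

error%(method, kind) = errors(method, kind) / requests(method) × 100

so summing the slices of one method over every error kind recovers that method's overall error percentage, since each failed request carries exactly one `error_kind` label.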
(Image files included in this commit: 149 KB, 94.4 KB, and 58.1 KB — previews not rendered here.)
