Add more Prometheus metrics for memory and Tokio tasks (#5282)

* Add more Prometheus metrics for memory and Tokio tasks

* pr comments
This commit is contained in:
aecsocket
2026-02-03 19:05:34 +00:00
committed by GitHub
parent ab753a82bc
commit 5d6593a9da
5 changed files with 171 additions and 85 deletions

View File

@@ -325,16 +325,6 @@ pub fn app_config(
.app_data(web::Data::new(labrinth_config.stripe_client.clone())) .app_data(web::Data::new(labrinth_config.stripe_client.clone()))
.app_data(web::Data::new(labrinth_config.anrok_client.clone())) .app_data(web::Data::new(labrinth_config.anrok_client.clone()))
.app_data(labrinth_config.rate_limiter.clone()) .app_data(labrinth_config.rate_limiter.clone())
.configure({
#[cfg(target_os = "linux")]
{
|cfg| routes::debug::config(cfg)
}
#[cfg(not(target_os = "linux"))]
{
|_cfg| ()
}
})
.configure(routes::v2::config) .configure(routes::v2::config)
.configure(routes::v3::config) .configure(routes::v3::config)
.configure(routes::internal::config) .configure(routes::internal::config)
@@ -346,8 +336,18 @@ pub fn utoipa_app_config(
cfg: &mut utoipa_actix_web::service_config::ServiceConfig, cfg: &mut utoipa_actix_web::service_config::ServiceConfig,
_labrinth_config: LabrinthConfig, _labrinth_config: LabrinthConfig,
) { ) {
cfg.configure(routes::v3::utoipa_config) cfg.configure({
.configure(routes::internal::utoipa_config); #[cfg(target_os = "linux")]
{
|cfg| routes::debug::config(cfg)
}
#[cfg(not(target_os = "linux"))]
{
|_cfg| ()
}
})
.configure(routes::v3::utoipa_config)
.configure(routes::internal::utoipa_config);
} }
// This is so that env vars not used immediately don't panic at runtime // This is so that env vars not used immediately don't panic at runtime

View File

@@ -206,9 +206,8 @@ async fn app() -> std::io::Result<()> {
.await .await
.expect("Failed to register redis metrics"); .expect("Failed to register redis metrics");
#[cfg(target_os = "linux")] labrinth::routes::debug::register_and_set_metrics(&prometheus.registry)
labrinth::routes::debug::jemalloc_memory_stats(&prometheus.registry) .expect("Failed to register debug metrics");
.expect("Failed to register jemalloc metrics");
let labrinth_config = labrinth::app_setup( let labrinth_config = labrinth::app_setup(
pool.clone(), pool.clone(),

View File

@@ -1,83 +1,71 @@
use crate::routes::ApiError;
use crate::util::cors::default_cors;
use crate::util::guards::admin_key_guard;
use actix_web::{HttpResponse, get};
use prometheus::{IntGauge, Registry};
use std::time::Duration; use std::time::Duration;
pub fn config(cfg: &mut actix_web::web::ServiceConfig) { use eyre::Context;
use eyre::eyre;
use prometheus::IntGauge;
use crate::util::cors::default_cors;
#[cfg(target_os = "linux")]
mod pprof;
pub fn config(cfg: &mut utoipa_actix_web::service_config::ServiceConfig) {
cfg.service( cfg.service(
actix_web::web::scope("/debug") utoipa_actix_web::scope("/debug")
.wrap(default_cors()) .wrap(default_cors())
.service(heap) .configure({
.service(flame_graph), #[cfg(target_os = "linux")]
{
pprof::config
}
#[cfg(not(target_os = "linux"))]
{
|_cfg| ()
}
}),
); );
} }
#[get("pprof/heap", guard = "admin_key_guard")] pub fn register_and_set_metrics(
pub async fn heap() -> Result<HttpResponse, ApiError> { registry: &prometheus::Registry,
let mut prof_ctl = jemalloc_pprof::PROF_CTL.as_ref().unwrap().lock().await; ) -> eyre::Result<()> {
require_profiling_activated(&prof_ctl)?; #[cfg(target_os = "linux")]
let pprof = prof_ctl {
.dump_pprof() pprof::register_and_set_metrics(registry)
.map_err(|err| ApiError::InvalidInput(err.to_string()))?; .wrap_err("failed to register jemalloc metrics")?;
Ok(HttpResponse::Ok()
.content_type("application/octet-stream")
.body(pprof))
}
#[get("pprof/heap/flamegraph", guard = "admin_key_guard")]
pub async fn flame_graph() -> Result<HttpResponse, ApiError> {
let mut prof_ctl = jemalloc_pprof::PROF_CTL.as_ref().unwrap().lock().await;
require_profiling_activated(&prof_ctl)?;
let svg = prof_ctl
.dump_flamegraph()
.map_err(|err| ApiError::InvalidInput(err.to_string()))?;
Ok(HttpResponse::Ok().content_type("image/svg+xml").body(svg))
}
fn require_profiling_activated(
prof_ctl: &jemalloc_pprof::JemallocProfCtl,
) -> Result<(), ApiError> {
if prof_ctl.activated() {
Ok(())
} else {
Err(ApiError::InvalidInput(
"Profiling is not activated".to_string(),
))
} }
}
pub fn jemalloc_memory_stats( let make_gauge = |key: &str, name: &str| {
registry: &Registry, IntGauge::new(key, name)
) -> Result<(), prometheus::Error> { .wrap_err_with(|| eyre!("failed to create gauge for '{key}'"))
let allocated_mem = IntGauge::new( };
"labrinth_memory_allocated",
"labrinth allocated memory", let num_workers = make_gauge(
"labrinth_tokio_num_workers",
"number of Tokio worker threads, excluding Actix HTTP server threads",
)?;
let num_alive_tasks = make_gauge(
"labrinth_tokio_num_alive_tasks",
"number of alive Tokio tasks, excluding Actix HTTP server tasks",
)?;
let global_queue_depth = make_gauge(
"labrinth_tokio_global_queue_depth",
"number of tasks in the global queue, excluding Actix runtime",
)?; )?;
let resident_mem =
IntGauge::new("labrinth_resident_memory", "labrinth resident memory")?;
registry.register(Box::new(allocated_mem.clone()))?; for gauge in [&num_workers, &num_alive_tasks, &global_queue_depth] {
registry.register(Box::new(resident_mem.clone()))?; registry
.register(Box::new(gauge.clone()))
.wrap_err("failed to register gauge")?;
}
tokio::spawn(async move { tokio::spawn(async move {
let e = tikv_jemalloc_ctl::epoch::mib().unwrap(); let metrics = tokio::runtime::Handle::current().metrics();
let allocated = tikv_jemalloc_ctl::stats::allocated::mib().unwrap();
let resident = tikv_jemalloc_ctl::stats::resident::mib().unwrap();
loop { loop {
e.advance().unwrap(); num_workers.set(metrics.num_workers() as i64);
num_alive_tasks.set(metrics.num_alive_tasks() as i64);
if let Ok(allocated) = allocated.read() { global_queue_depth.set(metrics.global_queue_depth() as i64);
allocated_mem.set(allocated as i64);
}
if let Ok(resident) = resident.read() {
resident_mem.set(resident as i64);
}
tokio::time::sleep(Duration::from_secs(5)).await; tokio::time::sleep(Duration::from_secs(5)).await;
} }

View File

@@ -0,0 +1,102 @@
use crate::routes::ApiError;
use crate::util::guards::admin_key_guard;
use actix_web::{HttpResponse, get};
use eyre::{Context, eyre};
use prometheus::{IntGauge, Registry};
use std::time::Duration;
pub fn config(cfg: &mut utoipa_actix_web::service_config::ServiceConfig) {
cfg.service(heap).service(flame_graph);
}
#[utoipa::path]
#[get("/pprof/heap", guard = "admin_key_guard")]
pub async fn heap() -> Result<HttpResponse, ApiError> {
let mut prof_ctl = jemalloc_pprof::PROF_CTL.as_ref().unwrap().lock().await;
require_profiling_activated(&prof_ctl)?;
let pprof = prof_ctl
.dump_pprof()
.map_err(|err| ApiError::InvalidInput(err.to_string()))?;
Ok(HttpResponse::Ok()
.content_type("application/octet-stream")
.body(pprof))
}
#[utoipa::path]
#[get("/pprof/heap/flamegraph", guard = "admin_key_guard")]
pub async fn flame_graph() -> Result<HttpResponse, ApiError> {
let mut prof_ctl = jemalloc_pprof::PROF_CTL.as_ref().unwrap().lock().await;
require_profiling_activated(&prof_ctl)?;
let svg = prof_ctl
.dump_flamegraph()
.map_err(|err| ApiError::InvalidInput(err.to_string()))?;
Ok(HttpResponse::Ok().content_type("image/svg+xml").body(svg))
}
fn require_profiling_activated(
prof_ctl: &jemalloc_pprof::JemallocProfCtl,
) -> Result<(), ApiError> {
if prof_ctl.activated() {
Ok(())
} else {
Err(ApiError::InvalidInput(
"Profiling is not activated".to_string(),
))
}
}
pub fn register_and_set_metrics(registry: &Registry) -> eyre::Result<()> {
let make_gauge = |key: &str, name: &str| {
IntGauge::new(key, name)
.wrap_err_with(|| eyre!("failed to create gauge for '{key}'"))
};
let active_mem =
make_gauge("labrinth_memory_active", "labrinth active memory")?;
let allocated_mem =
make_gauge("labrinth_memory_allocated", "labrinth allocated memory")?;
let mapped_mem =
make_gauge("labrinth_memory_mapped", "labrinth mapped memory")?;
let metadata_mem =
make_gauge("labrinth_memory_metadata", "labrinth metadata memory")?;
let resident_mem =
make_gauge("labrinth_memory_resident", "labrinth resident memory")?;
for gauge in [
&active_mem,
&allocated_mem,
&mapped_mem,
&metadata_mem,
&resident_mem,
] {
registry
.register(Box::new(gauge.clone()))
.wrap_err("failed to register gauge")?;
}
tokio::spawn(async move {
let epoch =
tikv_jemalloc_ctl::epoch::mib().expect("failed to get epoch");
let active = tikv_jemalloc_ctl::stats::active::mib().unwrap();
let allocated = tikv_jemalloc_ctl::stats::allocated::mib().unwrap();
let mapped = tikv_jemalloc_ctl::stats::mapped::mib().unwrap();
let metadata = tikv_jemalloc_ctl::stats::metadata::mib().unwrap();
let resident = tikv_jemalloc_ctl::stats::resident::mib().unwrap();
loop {
epoch.advance().unwrap();
_ = active.read().inspect(|x| active_mem.set(*x as i64));
_ = allocated.read().inspect(|x| allocated_mem.set(*x as i64));
_ = mapped.read().inspect(|x| mapped_mem.set(*x as i64));
_ = metadata.read().inspect(|x| metadata_mem.set(*x as i64));
_ = resident.read().inspect(|x| resident_mem.set(*x as i64));
tokio::time::sleep(Duration::from_secs(5)).await;
}
});
Ok(())
}

View File

@@ -10,14 +10,11 @@ use actix_web::{HttpResponse, web};
use futures::FutureExt; use futures::FutureExt;
use serde_json::json; use serde_json::json;
pub mod debug;
pub mod internal; pub mod internal;
pub mod v2; pub mod v2;
pub mod v3;
#[cfg(target_os = "linux")]
pub mod debug;
pub mod v2_reroute; pub mod v2_reroute;
pub mod v3;
mod analytics; mod analytics;
mod index; mod index;