Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DO NOT MERGE: Attempting to make the REST server more stable under load #4946

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ tower = { version = "0.4.13", features = [
"load",
"retry",
"util",
"load-shed",
] }
tower-http = { version = "0.4.0", features = ["compression-gzip", "cors"] }
tracing = "0.1.37"
Expand Down
2 changes: 1 addition & 1 deletion quickwit/quickwit-search/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub use crate::search_job_placer::{Job, SearchJobPlacer};
pub use crate::search_response_rest::SearchResponseRest;
pub use crate::search_stream::root_search_stream;
pub use crate::service::{MockSearchService, SearchService, SearchServiceImpl};
use crate::thread_pool::run_cpu_intensive;
pub use crate::thread_pool::run_cpu_intensive;

/// A pool of searcher clients identified by their gRPC socket address.
pub type SearcherPool = Pool<SocketAddr, SearchServiceClient>;
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-serve/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ humantime = { workspace = true }
hyper = { workspace = true }
itertools = { workspace = true }
mime_guess = { workspace = true }
rayon = { workspace = true }
num_cpus = { workspace = true }
once_cell = { workspace = true }
opentelemetry = { workspace = true }
Expand Down
53 changes: 50 additions & 3 deletions quickwit/quickwit-serve/src/decompression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,59 @@ use std::io::Read;

use bytes::Bytes;
use flate2::read::GzDecoder;
use once_cell::sync::OnceCell;
use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS};
use thiserror::Error;
use tokio::task;
use tracing::error;
use warp::reject::Reject;
use warp::Filter;

/// Returns the lazily-initialized rayon thread pool dedicated to the REST
/// server's CPU-heavy work (e.g. request body decompression).
///
/// The pool is built exactly once on first use and lives for the lifetime
/// of the process. Note it is deliberately sized to a single thread here.
fn thread_pool() -> &'static rayon::ThreadPool {
    static THREAD_POOL: OnceCell<rayon::ThreadPool> = OnceCell::new();
    THREAD_POOL.get_or_init(|| {
        let pool_builder = rayon::ThreadPoolBuilder::new()
            .num_threads(1)
            .thread_name(|thread_id| format!("quickwit-rest-{thread_id}"))
            // A panicking task must not take the whole process down: log and
            // keep the pool alive.
            .panic_handler(|_panic_info| {
                error!("task running in the quickwit rest pool panicked");
            });
        pool_builder
            .build()
            .expect("Failed to spawn the spawning pool")
    })
}

/// Function similar to `tokio::task::spawn_blocking`.
///
/// Here are two important differences however:
///
/// 1) The task runs on a dedicated rayon thread pool managed by quickwit
/// (see `thread_pool()`), used only to run this kind of CPU-intensive work.
///
/// 2) Before the task is effectively scheduled, we check that
/// the spawner is still interested in its result.
///
/// It is therefore required to `await` the result of this
/// function to get any work done.
///
/// This is nice, because it makes work that has been scheduled
/// but is not running yet "cancellable".
///
/// Returns `Err(())` if the task was dropped before completing (e.g. it
/// panicked, so the sending half of the channel was dropped unsent).
pub async fn run_cpu_intensive<F, R>(cpu_heavy_task: F) -> Result<R, ()>
where
    F: FnOnce() -> R + Send + 'static,
    R: Send + 'static,
{
    let (result_tx, result_rx) = tokio::sync::oneshot::channel();
    thread_pool().spawn(move || {
        // The receiver side was dropped: nobody is waiting for the result
        // anymore, so skip the expensive work entirely.
        if result_tx.is_closed() {
            return;
        }
        // The send may fail if the receiver was dropped in the meantime;
        // in that case the result is simply discarded.
        let _ = result_tx.send(cpu_heavy_task());
    });
    result_rx.await.map_err(|_| ())
}

/// There are two ways to decompress the body:
/// - Stream the body through an async decompressor
/// - Fetch the body and then decompress the bytes
Expand All @@ -37,7 +84,7 @@ use warp::Filter;
async fn decompress_body(encoding: Option<String>, body: Bytes) -> Result<Bytes, warp::Rejection> {
match encoding.as_deref() {
Some("gzip" | "x-gzip") => {
let decompressed = task::spawn_blocking(move || {
let decompressed = run_cpu_intensive(move || {
let mut decompressed = Vec::new();
let mut decoder = GzDecoder::new(body.as_ref());
decoder
Expand All @@ -50,7 +97,7 @@ async fn decompress_body(encoding: Option<String>, body: Bytes) -> Result<Bytes,
Ok(decompressed)
}
Some("zstd") => {
let decompressed = task::spawn_blocking(move || {
let decompressed = run_cpu_intensive(move || {
zstd::decode_all(body.as_ref())
.map(Bytes::from)
.map_err(|_| warp::reject::custom(CorruptedData))
Expand Down
6 changes: 4 additions & 2 deletions quickwit/quickwit-serve/src/elasticsearch_api/bulk_v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use std::collections::HashMap;
use std::time::Instant;
use std::time::{Duration, Instant};

use elasticsearch_dsl::ErrorCause;
use hyper::StatusCode;
use quickwit_config::INGEST_V2_SOURCE_ID;
use quickwit_ingest::IngestRequestV2Builder;
Expand Down Expand Up @@ -130,7 +131,8 @@ pub(crate) async fn elastic_bulk_ingest_v2(
let Some(ingest_request) = ingest_request_opt else {
return Ok(ElasticBulkResponse::default());
};
let ingest_response_v2 = ingest_router.ingest(ingest_request).await?;
let ingest_response_v2 = tokio::time::timeout(Duration::from_millis(500), ingest_router.ingest(ingest_request)).await
.map_err(|_| ElasticsearchError::new(StatusCode::REQUEST_TIMEOUT, "router timeout".to_string(), None))??;
let errors = !ingest_response_v2.failures.is_empty();
let mut items = Vec::new();

Expand Down
5 changes: 5 additions & 0 deletions quickwit/quickwit-serve/src/rest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

use hyper::http::HeaderValue;
use hyper::{http, Method, StatusCode};
Expand Down Expand Up @@ -132,6 +133,10 @@ pub(crate) async fn start_rest_server(
let cors = build_cors(&quickwit_services.node_config.rest_config.cors_allow_origins);

let service = ServiceBuilder::new()
.timeout(Duration::from_millis(500)) // TO NOT MERGE THIS, THIS IS JUST FOR A TEST.
.concurrency_limit(quickwit_common::get_from_env("QW_REST_CONCURRENCY_LIMIT", 5))
.load_shed()
.concurrency_limit(quickwit_common::get_from_env("QW_REST_LOAD_SHED_LIMIT", 30))
.layer(
CompressionLayer::new()
.gzip(true)
Expand Down