From 3fd820942035f6b089157495b87bfa4118804860 Mon Sep 17 00:00:00 2001 From: Will Temple Date: Thu, 19 Mar 2026 17:54:29 -0400 Subject: [PATCH] Restaged repo, allocator and runtime implemented, ioring-backed async fs/net/channel/timer primitives --- .gitignore | 1 + Cargo.lock | 199 +++ Cargo.toml | 3 + lib/runtime/Cargo.toml | 13 + lib/runtime/examples/async_fs_showcase.rs | 81 ++ lib/runtime/examples/channel_showcase.rs | 160 +++ lib/runtime/examples/hyper_http_client.rs | 75 ++ lib/runtime/examples/runtime_loop_showcase.rs | 228 ++++ lib/runtime/src/channel/mod.rs | 4 + lib/runtime/src/channel/mpsc.rs | 575 +++++++++ lib/runtime/src/channel/oneshot.rs | 281 +++++ lib/runtime/src/fs.rs | 552 +++++++++ lib/runtime/src/lib.rs | 78 ++ lib/runtime/src/net.rs | 963 +++++++++++++++ lib/runtime/src/op/completion.rs | 147 +++ lib/runtime/src/op/fs.rs | 105 ++ lib/runtime/src/op/mod.rs | 8 + lib/runtime/src/op/net.rs | 69 ++ .../linux_x86_64/mesh_alloc/allocator.rs | 864 +++++++++++++ .../platform/linux_x86_64/mesh_alloc/arena.rs | 428 +++++++ .../linux_x86_64/mesh_alloc/bitmap.rs | 236 ++++ .../linux_x86_64/mesh_alloc/constants.rs | 19 + .../platform/linux_x86_64/mesh_alloc/fault.rs | 132 ++ .../linux_x86_64/mesh_alloc/global.rs | 453 +++++++ .../linux_x86_64/mesh_alloc/meshing.rs | 15 + .../linux_x86_64/mesh_alloc/miniheap.rs | 374 ++++++ .../platform/linux_x86_64/mesh_alloc/mod.rs | 45 + .../platform/linux_x86_64/mesh_alloc/page.rs | 94 ++ .../linux_x86_64/mesh_alloc/platform.rs | 168 +++ .../platform/linux_x86_64/mesh_alloc/pool.rs | 152 +++ .../linux_x86_64/mesh_alloc/raw_sys.rs | 218 ++++ .../platform/linux_x86_64/mesh_alloc/rng.rs | 96 ++ .../linux_x86_64/mesh_alloc/shuffle.rs | 204 ++++ .../linux_x86_64/mesh_alloc/size_map.rs | 73 ++ .../platform/linux_x86_64/mesh_alloc/span.rs | 45 + .../platform/linux_x86_64/mesh_alloc/stats.rs | 267 +++++ .../platform/linux_x86_64/mesh_alloc/sync.rs | 137 +++ .../mesh_alloc/thread_local_heap.rs | 124 ++ lib/runtime/src/platform/linux_x86_64/mod.rs | 4 + .../src/platform/linux_x86_64/reactor.rs | 340 ++++++ .../src/platform/linux_x86_64/runtime.rs | 1067 +++++++++++++++++ .../src/platform/linux_x86_64/uring.rs | 478 ++++++++ lib/runtime/src/platform/mod.rs | 2 + lib/runtime/src/sys/linux/channel.rs | 10 + lib/runtime/src/sys/linux/fs.rs | 586 +++++++++ lib/runtime/src/sys/linux/mod.rs | 5 + lib/runtime/src/sys/linux/net.rs | 974 +++++++++++++++ lib/runtime/src/sys/mod.rs | 4 + lib/runtime/src/time.rs | 175 +++ lib/runtime_proc_macros/Cargo.toml | 12 + lib/runtime_proc_macros/src/lib.rs | 128 ++ 51 files changed, 11471 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 lib/runtime/Cargo.toml create mode 100644 lib/runtime/examples/async_fs_showcase.rs create mode 100644 lib/runtime/examples/channel_showcase.rs create mode 100644 lib/runtime/examples/hyper_http_client.rs create mode 100644 lib/runtime/examples/runtime_loop_showcase.rs create mode 100644 lib/runtime/src/channel/mod.rs create mode 100644 lib/runtime/src/channel/mpsc.rs create mode 100644 lib/runtime/src/channel/oneshot.rs create mode 100644 lib/runtime/src/fs.rs create mode 100644 lib/runtime/src/lib.rs create mode 100644 lib/runtime/src/net.rs create mode 100644 lib/runtime/src/op/completion.rs create mode 100644 lib/runtime/src/op/fs.rs create mode 100644 lib/runtime/src/op/mod.rs create mode 100644 lib/runtime/src/op/net.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/allocator.rs create mode 
100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/arena.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/bitmap.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/constants.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/fault.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/global.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/meshing.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/miniheap.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/mod.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/page.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/platform.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/pool.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/raw_sys.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/rng.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/shuffle.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/size_map.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/span.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/stats.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/sync.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mesh_alloc/thread_local_heap.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/mod.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/reactor.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/runtime.rs create mode 100644 lib/runtime/src/platform/linux_x86_64/uring.rs create mode 100644 lib/runtime/src/platform/mod.rs create mode 100644 lib/runtime/src/sys/linux/channel.rs create mode 100644 lib/runtime/src/sys/linux/fs.rs create mode 100644 lib/runtime/src/sys/linux/mod.rs create mode 100644 lib/runtime/src/sys/linux/net.rs create mode 100644 lib/runtime/src/sys/mod.rs create mode 100644 lib/runtime/src/time.rs create mode 100644 lib/runtime_proc_macros/Cargo.toml create mode 100644 lib/runtime_proc_macros/src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..a991cc3 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,199 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "libc" +version = "0.2.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ruin-runtime" 
+version = "0.1.0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "libc", + "ruin-runtime-proc-macros", +] + +[[package]] +name = "ruin-runtime-proc-macros" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tokio" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +dependencies = [ + "pin-project-lite", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..54eefb2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,3 @@ +[workspace] +resolver = "3" +members = ["lib/*"] diff --git a/lib/runtime/Cargo.toml b/lib/runtime/Cargo.toml new file mode 100644 index 0000000..d5624d5 --- /dev/null +++ b/lib/runtime/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "ruin-runtime" +version = "0.1.0" +edition = "2024" + +[dependencies] +hyper = { version = "1.8", default-features = false, features = ["client", "http1"] } +libc = "0.2" +ruin_runtime_proc_macros = { package = "ruin-runtime-proc-macros", path = "../runtime_proc_macros" } + +[dev-dependencies] +bytes = "1" +http-body-util = "0.1" diff --git a/lib/runtime/examples/async_fs_showcase.rs b/lib/runtime/examples/async_fs_showcase.rs new file mode 100644 index 0000000..c8cb703 --- /dev/null +++ b/lib/runtime/examples/async_fs_showcase.rs @@ -0,0 +1,81 @@ +use ruin_runtime::fs::{self, File}; +use std::path::PathBuf; + +fn preview(bytes: &[u8]) -> String { + String::from_utf8_lossy(bytes).replace('\n', "\\n") +} + +#[ruin_runtime::async_main] +async fn main() { + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let cargo_toml = manifest_dir.join("Cargo.toml"); + let src_dir = manifest_dir.join("src"); + + println!("manifest dir: {}", manifest_dir.display()); + + let cargo_meta = fs::metadata(&cargo_toml) + .await + .expect("Cargo.toml metadata should load"); + println!( + "Cargo.toml: {} bytes, file={}, empty={}", + cargo_meta.len(), + cargo_meta.is_file(), + cargo_meta.is_empty() + ); + + let mut file = File::open(&cargo_toml) + .await + .expect("Cargo.toml should open for reading"); + let file_meta = file + .metadata() + .await + .expect("opened file metadata should load"); + println!("opened file metadata size: {}", file_meta.len()); + + let mut sequential = vec![0; 96]; + let sequential_read = file + .read(&mut sequential) + .await + .expect("sequential read should 
succeed"); + sequential.truncate(sequential_read); + println!( + "sequential read ({sequential_read} bytes): {}", + preview(&sequential) + ); + + let cloned = file.try_clone().await.expect("file clone should succeed"); + let mut positioned = [0u8; 48]; + let positioned_read = cloned + .read_at(0, &mut positioned) + .await + .expect("positioned read should succeed"); + println!( + "positioned read ({positioned_read} bytes): {}", + preview(&positioned[..positioned_read]) + ); + + let cargo_text = fs::read_to_string(&cargo_toml) + .await + .expect("read_to_string should succeed"); + println!("Cargo.toml line count: {}", cargo_text.lines().count()); + + let mut dir = fs::read_dir(&src_dir) + .await + .expect("src directory should be readable"); + let mut entries = Vec::new(); + while let Some(entry) = dir + .next_entry() + .await + .expect("read_dir stream should succeed") + { + let metadata = entry.metadata().await.expect("entry metadata should load"); + let kind = if metadata.is_dir() { "dir" } else { "file" }; + entries.push((entry.file_name().to_string_lossy().into_owned(), kind)); + } + entries.sort_by(|left, right| left.0.cmp(&right.0)); + + println!("src entries:"); + for (name, kind) in entries.iter().take(8) { + println!(" - {name} ({kind})"); + } +} diff --git a/lib/runtime/examples/channel_showcase.rs b/lib/runtime/examples/channel_showcase.rs new file mode 100644 index 0000000..ca2d07f --- /dev/null +++ b/lib/runtime/examples/channel_showcase.rs @@ -0,0 +1,160 @@ +use ruin_runtime::channel::{mpsc, oneshot}; +use ruin_runtime::{queue_future, spawn_worker, time::sleep}; +use std::fmt; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +static START: OnceLock = OnceLock::new(); +static ACTUAL_ORDER: AtomicUsize = AtomicUsize::new(1); + +macro_rules! 
log_event { + ($expected:literal, $($arg:tt)*) => {{ + log_event_impl($expected, format_args!($($arg)*)); + }}; +} + +fn log_event_impl(expected: usize, message: fmt::Arguments<'_>) { + let actual = ACTUAL_ORDER.fetch_add(1, Ordering::SeqCst); + let elapsed = START + .get() + .expect("showcase start time should be initialized") + .elapsed() + .as_millis(); + println!( + "[actual {actual:02} | expected {expected:02} | +{elapsed:04}ms | ts {}] {message}", + unix_timestamp_millis(), + ); +} + +fn unix_timestamp_millis() -> String { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system clock should be after the Unix epoch"); + format!("{}.{:03}", now.as_secs(), now.subsec_millis()) +} + +enum WorkerEvent { + Log(String), + PresentRequest { + frame: &'static str, + ack: oneshot::Sender<&'static str>, + }, +} + +#[ruin_runtime::async_main] +async fn main() -> Result<(), Box> { + START.get_or_init(Instant::now); + + let (job_tx, mut job_rx) = mpsc::channel::<&'static str>(1); + let (event_tx, mut event_rx) = mpsc::unbounded_channel::(); + + let worker = spawn_worker( + move || { + queue_future(async move { + while let Some(job) = job_rx.recv().await { + event_tx + .send(WorkerEvent::Log(format!( + "[worker] accepted job `{job}` from main thread" + ))) + .unwrap_or_else(|_| { + panic!("worker should be able to report accepted jobs") + }); + + sleep(Duration::from_millis(20)).await; + if job == "upload-frame" { + let (ack_tx, mut ack_rx) = oneshot::channel(); + event_tx + .send(WorkerEvent::PresentRequest { + frame: job, + ack: ack_tx, + }) + .unwrap_or_else(|_| { + panic!("worker should be able to request presentation") + }); + let ack = ack_rx + .recv() + .await + .expect("main thread should acknowledge frame"); + event_tx + .send(WorkerEvent::Log(format!( + "[worker] got oneshot ack `{ack}` for `{job}`" + ))) + .unwrap_or_else(|_| { + panic!("worker should be able to report ack reception") + }); + } + } + + event_tx + .send(WorkerEvent::Log( + "[worker] bounded command channel closed; worker is done".into(), + )) + .unwrap_or_else(|_| panic!("worker should be able to report shutdown")); + }); + }, + || log_event!(12, "[main] worker exited"), + ); + + queue_future(async move { + log_event!(1, "[main] bounded mpsc send: enqueue `prepare-scene`"); + job_tx + .send("prepare-scene") + .await + .expect("prepare-scene should be sent"); + + log_event!( + 2, + "[main] bounded mpsc send: enqueue `upload-frame` (fits once worker drains capacity)" + ); + job_tx + .send("upload-frame") + .await + .expect("upload-frame should be sent"); + + log_event!( + 3, + "[main] bounded mpsc send: enqueue `flush-stats` (waits for capacity/backpressure)" + ); + job_tx + .send("flush-stats") + .await + .expect("flush-stats should be sent"); + + log_event!( + 5, + "[main] drop bounded sender to close worker command stream" + ); + drop(job_tx); + }); + + let mut event_count = 0usize; + while let Some(event) = event_rx.recv().await { + event_count += 1; + match event { + WorkerEvent::Log(message) => { + let expected = match event_count { + 1 => 4, + 2 => 6, + 4 => 9, + 5 => 10, + 6 => 11, + _ => 10 + event_count, + }; + log_event_impl(expected, format_args!("{message}")); + } + WorkerEvent::PresentRequest { frame, ack } => { + log_event!( + 7, + "[main] unbounded mpsc recv: worker requests presentation for `{frame}`" + ); + ack.send("presented") + .expect("main thread should be able to answer oneshot"); + log_event!(8, "[main] oneshot send: acknowledged frame presentation"); + } + } + } + + let _ = 
worker; + Ok(()) +} diff --git a/lib/runtime/examples/hyper_http_client.rs b/lib/runtime/examples/hyper_http_client.rs new file mode 100644 index 0000000..8d7e392 --- /dev/null +++ b/lib/runtime/examples/hyper_http_client.rs @@ -0,0 +1,75 @@ +use std::io::{Read as _, Write as _}; +use std::net::TcpListener as StdTcpListener; +use std::thread; +use std::time::Duration; + +use bytes::Bytes; +use http_body_util::{BodyExt, Empty}; +use hyper::Request; +use ruin_runtime::time::sleep; +use ruin_runtime::{clear_interval, queue_future, set_interval}; + +fn spawn_demo_server() -> std::io::Result<(std::net::SocketAddr, thread::JoinHandle<()>)> { + let listener = StdTcpListener::bind(("127.0.0.1", 0))?; + let address = listener.local_addr()?; + let handle = thread::Builder::new() + .name("hyper-demo-server".into()) + .spawn(move || { + let (mut stream, peer) = listener.accept().expect("demo server should accept"); + let mut request = [0; 1024]; + let read = stream.read(&mut request).expect("demo server should read"); + println!("[server] accepted {peer}, saw {} request bytes", read); + + let response = concat!( + "HTTP/1.1 200 OK\r\n", + "content-type: text/plain; charset=utf-8\r\n", + "content-length: 24\r\n", + "connection: close\r\n", + "\r\n", + "hello from ruin runtime!" + ); + stream + .write_all(response.as_bytes()) + .expect("demo server should reply"); + }) + .map_err(std::io::Error::other)?; + Ok((address, handle)) +} + +#[ruin_runtime::async_main] +async fn main() -> Result<(), Box> { + let (address, server) = spawn_demo_server()?; + + let stream = ruin_runtime::net::TcpStream::connect(address).await?; + let (mut sender, connection) = hyper::client::conn::http1::handshake(stream).await?; + queue_future(async move { + if let Err(error) = connection.await { + eprintln!("[runtime] hyper connection ended with error: {error}"); + } + }); + + println!("Sleeping a moment to let the server start..."); + let interval = set_interval(Duration::from_millis(400), || println!("...")); + sleep(Duration::from_secs(2)).await; + clear_interval(&interval); + println!("Let's go!"); + + let request = Request::builder() + .method("GET") + .uri(format!("http://{address}/demo")) + .header("host", address.to_string()) + .body(Empty::::new())?; + let response = sender.send_request(request).await?; + let status = response.status(); + let body = response.into_body().collect().await?.to_bytes(); + + println!( + "[client] status={status}, body={}", + String::from_utf8_lossy(&body) + ); + + server + .join() + .expect("demo server thread should exit cleanly"); + Ok(()) +} diff --git a/lib/runtime/examples/runtime_loop_showcase.rs b/lib/runtime/examples/runtime_loop_showcase.rs new file mode 100644 index 0000000..98c7da9 --- /dev/null +++ b/lib/runtime/examples/runtime_loop_showcase.rs @@ -0,0 +1,228 @@ +use ruin_runtime::{ + IntervalHandle, ThreadHandle, clear_interval, current_thread_handle, queue_future, + queue_microtask, queue_task, set_interval, set_timeout, spawn_worker, yield_now, +}; +use std::cell::{Cell, RefCell}; +use std::fmt; +use std::rc::Rc; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +static START: OnceLock = OnceLock::new(); +static ACTUAL_ORDER: AtomicUsize = AtomicUsize::new(1); + +macro_rules! 
log_event { + ($expected:literal, $($arg:tt)*) => {{ + log_event_impl($expected, format_args!($($arg)*)); + }}; +} + +fn log_event_impl(expected: usize, message: fmt::Arguments<'_>) { + let actual = ACTUAL_ORDER.fetch_add(1, Ordering::SeqCst); + let elapsed = START + .get() + .expect("showcase start time should be initialized") + .elapsed() + .as_millis(); + println!( + "[actual {actual:02} | expected {expected:02} | +{elapsed:04}ms | ts {}] {message}", + unix_timestamp_millis(), + ); +} + +fn unix_timestamp_millis() -> String { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system clock should be after the Unix epoch"); + format!("{}.{:03}", now.as_secs(), now.subsec_millis()) +} + +fn queue_log(handle: &ThreadHandle, expected: usize, message: impl Into) { + let message = message.into(); + let queued = handle.queue_task(move || { + log_event_impl(expected, format_args!("{message}")); + }); + assert!(queued, "main thread should accept log task {expected}"); +} + +fn queue_log_microtask(handle: &ThreadHandle, expected: usize, message: impl Into) { + let message = message.into(); + let queued = handle.queue_microtask(move || { + log_event_impl(expected, format_args!("{message}")); + }); + assert!(queued, "main thread should accept log microtask {expected}"); +} + +#[ruin_runtime::main] +fn main() { + START.get_or_init(Instant::now); + + queue_microtask(|| log_event!(1, "[main] boot microtask: prime UI state")); + + queue_future(async { + log_event!(2, "[main] future: fetch scene metadata"); + yield_now().await; + log_event!(4, "[main] future: scene metadata cached"); + }); + + queue_microtask(|| { + log_event!(3, "[main] microtask queued immediately"); + }); + + let main_handle = current_thread_handle(); + queue_task(move || { + log_event!( + 5, + "[main] boot task: paint first frame and start background worker" + ); + + let dashboard_interval = Rc::new(RefCell::new(None::)); + let dashboard_ticks = Rc::new(Cell::new(0usize)); + { + let slot = Rc::clone(&dashboard_interval); + let ticks = Rc::clone(&dashboard_ticks); + set_dashboard_interval(slot, ticks); + } + + set_timeout(Duration::from_millis(30), || { + log_event!(11, "[main] timeout: network snapshot ready"); + }); + + let main_for_worker = main_handle.clone(); + let worker = spawn_worker( + move || { + queue_log( + &main_for_worker, + 6, + "[worker->main] startup task: prepare upload queue", + ); + + { + let main_for_microtask = main_for_worker.clone(); + queue_microtask(move || { + queue_log( + &main_for_microtask, + 7, + "[worker->main] microtask: inspect staging buffers", + ); + }); + } + + { + let main_for_future = main_for_worker.clone(); + queue_future(async move { + queue_log( + &main_for_future, + 8, + "[worker->main] future: compile shader variants", + ); + yield_now().await; + queue_log( + &main_for_future, + 9, + "[worker->main] future: shader cache is warm", + ); + }); + } + + { + let main_for_task = main_for_worker.clone(); + queue_task(move || { + queue_log( + &main_for_task, + 10, + "[worker->main] task: upload static geometry", + ); + }); + } + + let sample_interval = Rc::new(RefCell::new(None::)); + let sample_count = Rc::new(Cell::new(0usize)); + { + let slot = Rc::clone(&sample_interval); + let count = Rc::clone(&sample_count); + let main_for_samples = main_for_worker.clone(); + let handle = set_interval(Duration::from_millis(40), move || { + let next = count.get() + 1; + count.set(next); + queue_log( + &main_for_samples, + if next == 1 { 12 } else { 17 }, + format!("[worker->main] interval: 
sample batch {next} ready"), + ); + if next == 2 { + let interval = slot.borrow_mut().take().expect("interval should exist"); + clear_interval(&interval); + queue_log(&main_for_samples, 18, "[worker->main] interval stopped"); + } + }); + *sample_interval.borrow_mut() = Some(handle); + } + + { + let main_for_flush = main_for_worker.clone(); + set_timeout(Duration::from_millis(110), move || { + queue_log_microtask( + &main_for_flush, + 20, + "[worker->main] timeout: flushed final upload batch", + ); + }); + } + }, + || log_event!(21, "[main] worker exited"), + ); + + set_timeout(Duration::from_millis(70), move || { + let queued = worker.queue_task({ + let main_from_remote_task = main_handle.clone(); + move || { + queue_log( + &main_from_remote_task, + 15, + "[worker->main] remote task: upload late texture atlas", + ); + + let main_from_remote_microtask = main_from_remote_task.clone(); + queue_microtask(move || { + queue_log( + &main_from_remote_microtask, + 16, + "[worker->main] remote microtask: retire staging pages", + ); + }); + } + }); + + log_event!( + 14, + "[main] timeout: queue late texture upload on worker (queued={queued})" + ); + }); + + set_timeout(Duration::from_millis(140), || { + log_event!(22, "[main] final timeout: commit frame statistics"); + }); + }); +} + +fn set_dashboard_interval(slot: Rc>>, ticks: Rc>) { + let slot_for_callback = Rc::clone(&slot); + let handle = set_interval(Duration::from_millis(50), move || { + let next = ticks.get() + 1; + ticks.set(next); + if next == 1 { + log_event!(13, "[main] interval: dashboard tick 1"); + return; + } + + let interval = slot_for_callback + .borrow_mut() + .take() + .expect("interval should exist"); + clear_interval(&interval); + log_event!(19, "[main] interval: dashboard tick 2 and stop"); + }); + *slot.borrow_mut() = Some(handle); +} diff --git a/lib/runtime/src/channel/mod.rs b/lib/runtime/src/channel/mod.rs new file mode 100644 index 0000000..b58519e --- /dev/null +++ b/lib/runtime/src/channel/mod.rs @@ -0,0 +1,4 @@ +//! Async channels for inter-thread communication. 
+ +pub mod mpsc; +pub mod oneshot; diff --git a/lib/runtime/src/channel/mpsc.rs b/lib/runtime/src/channel/mpsc.rs new file mode 100644 index 0000000..db4634c --- /dev/null +++ b/lib/runtime/src/channel/mpsc.rs @@ -0,0 +1,575 @@ +use std::collections::VecDeque; +use std::future::poll_fn; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; + +use crate::op::completion::{CompletionFuture, CompletionHandle}; +use crate::sys::linux::channel::runtime_waiter; + +pub fn channel(capacity: usize) -> (Sender, Receiver) { + assert!(capacity > 0, "bounded channels require capacity > 0"); + let shared = Arc::new(Mutex::new(State::new(Some(capacity)))); + ( + Sender { + shared: Arc::clone(&shared), + }, + Receiver { shared }, + ) +} + +pub fn unbounded_channel() -> (UnboundedSender, Receiver) { + let shared = Arc::new(Mutex::new(State::new(None))); + ( + UnboundedSender { + shared: Arc::clone(&shared), + }, + Receiver { shared }, + ) +} + +pub struct Sender { + shared: Arc>>, +} + +pub struct UnboundedSender { + shared: Arc>>, +} + +pub struct Receiver { + shared: Arc>>, +} + +struct State { + queue: VecDeque, + capacity: Option, + sender_count: usize, + receiver_closed: bool, + recv_waiter: Option>>, + send_waiters: VecDeque>, + next_waiter_id: usize, +} + +struct SendWaiter { + id: usize, + value: T, + handle: CompletionHandle>>, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct SendError(pub T); + +#[derive(Debug, Eq, PartialEq)] +pub enum TrySendError { + Full(T), + Closed(T), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TryRecvError { + Empty, + Disconnected, +} + +impl State { + fn new(capacity: Option) -> Self { + Self { + queue: VecDeque::new(), + capacity, + sender_count: 1, + receiver_closed: false, + recv_waiter: None, + send_waiters: VecDeque::new(), + next_waiter_id: 1, + } + } + + fn try_send_now(&mut self, value: T) -> Result<(), TrySendError> { + if self.receiver_closed { + return Err(TrySendError::Closed(value)); + } + + if let Some(waiter) = self.recv_waiter.take() { + waiter.complete(Some(value)); + return Ok(()); + } + + if self + .capacity + .is_some_and(|capacity| self.queue.len() >= capacity) + { + return Err(TrySendError::Full(value)); + } + + self.queue.push_back(value); + Ok(()) + } + + fn enqueue_send_waiter( + &mut self, + value: T, + handle: CompletionHandle>>, + ) -> usize { + let id = self.next_waiter_id; + self.next_waiter_id = self.next_waiter_id.wrapping_add(1); + self.send_waiters + .push_back(SendWaiter { id, value, handle }); + id + } + + fn remove_send_waiter(&mut self, waiter_id: usize) -> bool { + let Some(index) = self + .send_waiters + .iter() + .position(|waiter| waiter.id == waiter_id) + else { + return false; + }; + self.send_waiters.remove(index); + true + } + + fn pump_senders(&mut self) { + loop { + if self.receiver_closed { + self.fail_pending_senders(); + break; + } + + let has_capacity = self + .capacity + .is_none_or(|capacity| self.queue.len() < capacity); + if !has_capacity { + break; + } + + let Some(waiter) = self.send_waiters.pop_front() else { + break; + }; + + if let Some(receiver) = self.recv_waiter.take() { + receiver.complete(Some(waiter.value)); + } else { + self.queue.push_back(waiter.value); + } + waiter.handle.complete(Ok(())); + } + + if self.queue.is_empty() + && self.sender_count == 0 + && let Some(waiter) = self.recv_waiter.take() + { + waiter.complete(None); + } + } + + fn fail_pending_senders(&mut self) { + while let Some(waiter) = self.send_waiters.pop_front() { + 
waiter.handle.complete(Err(SendError(waiter.value))); + } + } + + fn close_receiver(&mut self) { + self.receiver_closed = true; + self.fail_pending_senders(); + if self.queue.is_empty() + && let Some(waiter) = self.recv_waiter.take() + { + waiter.complete(None); + } + } + + fn drop_sender(&mut self) { + self.sender_count = self.sender_count.saturating_sub(1); + if self.sender_count == 0 + && self.queue.is_empty() + && let Some(waiter) = self.recv_waiter.take() + { + waiter.complete(None); + } + } +} + +impl Clone for Sender { + fn clone(&self) -> Self { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .sender_count += 1; + Self { + shared: Arc::clone(&self.shared), + } + } +} + +impl Clone for UnboundedSender { + fn clone(&self) -> Self { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .sender_count += 1; + Self { + shared: Arc::clone(&self.shared), + } + } +} + +impl Sender { + pub async fn send(&self, value: T) -> Result<(), SendError> { + let mut value = Some(value); + let mut wait = None; + poll_fn(|cx| self.poll_send(cx, &mut value, &mut wait)).await + } + + pub fn try_send(&self, value: T) -> Result<(), TrySendError> { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .try_send_now(value) + } + + pub fn is_closed(&self) -> bool { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .receiver_closed + } + + fn poll_send( + &self, + cx: &mut Context<'_>, + value_slot: &mut Option, + wait: &mut Option>>>, + ) -> Poll>> { + if let Some(future) = wait.as_mut() { + match Pin::new(future).poll(cx) { + Poll::Ready(result) => { + wait.take(); + Poll::Ready(result) + } + Poll::Pending => Poll::Pending, + } + } else { + let mut state = self + .shared + .lock() + .expect("mpsc state should not be poisoned"); + match state.try_send_now(value_slot.take().expect("send value should be present")) { + Ok(()) => Poll::Ready(Ok(())), + Err(TrySendError::Closed(value)) => Poll::Ready(Err(SendError(value))), + Err(TrySendError::Full(returned)) => { + drop(state); + let (future, handle) = runtime_waiter::>>(); + let state_shared = Arc::clone(&self.shared); + let registration = { + let mut state = state_shared + .lock() + .expect("mpsc state should not be poisoned"); + match state.try_send_now(returned) { + Ok(()) => Ok(None), + Err(TrySendError::Closed(value)) => Err(SendError(value)), + Err(TrySendError::Full(value)) => { + Ok(Some(state.enqueue_send_waiter(value, handle.clone()))) + } + } + }; + match registration { + Ok(None) => { + handle.complete(Ok(())); + *wait = Some(future); + self.poll_send(cx, value_slot, wait) + } + Err(error) => { + handle.complete(Err(error)); + *wait = Some(future); + self.poll_send(cx, value_slot, wait) + } + Ok(Some(waiter_id)) => { + let cancel_shared = Arc::clone(&self.shared); + let cancel_handle = handle.clone(); + handle.set_cancel(move || { + let mut state = cancel_shared + .lock() + .expect("mpsc state should not be poisoned"); + let _ = state.remove_send_waiter(waiter_id); + drop(state); + cancel_handle.finish(None); + }); + *wait = Some(future); + self.poll_send(cx, value_slot, wait) + } + } + } + } + } + } +} + +impl UnboundedSender { + pub fn send(&self, value: T) -> Result<(), SendError> { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .try_send_now(value) + .map_err(|error| match error { + TrySendError::Full(value) | TrySendError::Closed(value) => SendError(value), + }) + } + + pub fn is_closed(&self) -> bool { + self.shared + .lock() + 
.expect("mpsc state should not be poisoned") + .receiver_closed + } +} + +impl Receiver { + pub async fn recv(&mut self) -> Option { + let mut wait = None; + poll_fn(|cx| self.poll_recv(cx, &mut wait)).await + } + + pub fn try_recv(&mut self) -> Result { + let mut state = self + .shared + .lock() + .expect("mpsc state should not be poisoned"); + if let Some(value) = state.queue.pop_front() { + state.pump_senders(); + Ok(value) + } else if state.sender_count == 0 || state.receiver_closed { + Err(TryRecvError::Disconnected) + } else { + Err(TryRecvError::Empty) + } + } + + pub fn close(&mut self) { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .close_receiver(); + } + + pub fn is_closed(&self) -> bool { + let state = self + .shared + .lock() + .expect("mpsc state should not be poisoned"); + state.receiver_closed || state.sender_count == 0 + } + + fn poll_recv( + &mut self, + cx: &mut Context<'_>, + wait: &mut Option>>, + ) -> Poll> { + if let Some(future) = wait.as_mut() { + match Pin::new(future).poll(cx) { + Poll::Ready(result) => { + wait.take(); + Poll::Ready(result) + } + Poll::Pending => Poll::Pending, + } + } else { + let (future, handle) = runtime_waiter::>(); + let cancel_shared = Arc::clone(&self.shared); + let cancel_handle = handle.clone(); + handle.set_cancel(move || { + let mut state = cancel_shared + .lock() + .expect("mpsc state should not be poisoned"); + let _ = state.recv_waiter.take(); + drop(state); + cancel_handle.finish(None); + }); + + { + let mut state = self + .shared + .lock() + .expect("mpsc state should not be poisoned"); + if let Some(value) = state.queue.pop_front() { + state.pump_senders(); + handle.complete(Some(value)); + } else if state.receiver_closed || state.sender_count == 0 { + handle.complete(None); + } else { + assert!( + state.recv_waiter.is_none(), + "only one mpsc receive operation may wait at a time" + ); + state.recv_waiter = Some(handle.clone()); + } + } + + *wait = Some(future); + self.poll_recv(cx, wait) + } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .drop_sender(); + } +} + +impl Drop for UnboundedSender { + fn drop(&mut self) { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .drop_sender(); + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + self.shared + .lock() + .expect("mpsc state should not be poisoned") + .close_receiver(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + use std::time::Duration; + + use crate::time::sleep; + use crate::{queue_future, queue_task, run, spawn_worker}; + + use super::{TryRecvError, TrySendError, channel, unbounded_channel}; + + #[test] + fn bounded_channel_applies_backpressure() { + let log = Arc::new(Mutex::new(Vec::::new())); + let log_for_task = Arc::clone(&log); + + queue_task(move || { + let (sender, mut receiver) = channel(1); + let log_for_sender = Arc::clone(&log_for_task); + let log_for_receiver = Arc::clone(&log_for_task); + + queue_future(async move { + sender + .send("first") + .await + .expect("first send should succeed"); + log_for_sender + .lock() + .unwrap() + .push("sent first".to_string()); + sender + .send("second") + .await + .expect("second send should succeed"); + log_for_sender + .lock() + .unwrap() + .push("sent second".to_string()); + }); + + queue_future(async move { + sleep(Duration::from_millis(5)).await; + let first = receiver.recv().await.expect("first recv should succeed"); + log_for_receiver + .lock() + 
.unwrap() + .push(format!("received {first}")); + let second = receiver.recv().await.expect("second recv should succeed"); + log_for_receiver + .lock() + .unwrap() + .push(format!("received {second}")); + }); + }); + run(); + + let log = log.lock().unwrap(); + let sent_first = log.iter().position(|entry| entry == "sent first").unwrap(); + let received_first = log + .iter() + .position(|entry| entry == "received first") + .unwrap(); + let sent_second = log.iter().position(|entry| entry == "sent second").unwrap(); + let received_second = log + .iter() + .position(|entry| entry == "received second") + .unwrap(); + + assert!( + sent_first < received_first, + "first send should happen before first recv" + ); + assert!( + received_first < sent_second, + "second send should not complete before capacity is freed" + ); + assert!( + received_first < received_second, + "receiver should observe messages in FIFO order" + ); + } + + #[test] + fn unbounded_channel_moves_messages_across_worker_threads() { + let log = Arc::new(Mutex::new(Vec::new())); + let log_for_task = Arc::clone(&log); + + queue_task(move || { + let (sender, mut receiver) = unbounded_channel::(); + let worker_sender = sender.clone(); + let log_for_receiver = Arc::clone(&log_for_task); + + let _worker = spawn_worker( + move || { + queue_task(move || { + worker_sender + .send("worker boot".into()) + .expect("worker boot send should succeed"); + worker_sender + .send("worker done".into()) + .expect("worker done send should succeed"); + }); + }, + || {}, + ); + drop(sender); + + queue_future(async move { + while let Some(message) = receiver.recv().await { + log_for_receiver.lock().unwrap().push(message); + } + }); + }); + run(); + + assert_eq!( + log.lock().unwrap().as_slice(), + ["worker boot", "worker done"] + ); + } + + #[test] + fn try_send_try_recv_and_close_semantics_work() { + let (sender, mut receiver) = channel(1); + sender + .try_send(1usize) + .expect("initial send should succeed"); + assert_eq!(sender.try_send(2usize), Err(TrySendError::Full(2))); + assert_eq!(receiver.try_recv(), Ok(1)); + assert_eq!(receiver.try_recv(), Err(TryRecvError::Empty)); + receiver.close(); + assert!(sender.is_closed(), "sender should observe closed receiver"); + assert_eq!(sender.try_send(3usize), Err(TrySendError::Closed(3))); + assert_eq!(receiver.try_recv(), Err(TryRecvError::Disconnected)); + } +} diff --git a/lib/runtime/src/channel/oneshot.rs b/lib/runtime/src/channel/oneshot.rs new file mode 100644 index 0000000..79ea4c4 --- /dev/null +++ b/lib/runtime/src/channel/oneshot.rs @@ -0,0 +1,281 @@ +use std::future::poll_fn; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; + +use crate::op::completion::{CompletionFuture, CompletionHandle}; +use crate::sys::linux::channel::runtime_waiter; + +pub fn channel() -> (Sender, Receiver) { + let shared = Arc::new(Mutex::new(State { + value: None, + sender_alive: true, + receiver_closed: false, + waiter: None, + })); + ( + Sender { + shared: Some(Arc::clone(&shared)), + }, + Receiver { + shared, + consumed: false, + }, + ) +} + +pub struct Sender { + shared: Option>>>, +} + +pub struct Receiver { + shared: Arc>>, + consumed: bool, +} + +struct State { + value: Option, + sender_alive: bool, + receiver_closed: bool, + waiter: Option>>, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct SendError(pub T); + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct RecvError; + +#[derive(Debug, Eq, PartialEq)] +pub enum TryRecvError { + Empty, + Closed, +} + +impl Sender { + pub 
fn send(mut self, value: T) -> Result<(), SendError> { + let Some(shared) = self.shared.take() else { + return Err(SendError(value)); + }; + + let waiter = { + let mut state = shared.lock().expect("oneshot state should not be poisoned"); + state.sender_alive = false; + if state.receiver_closed { + return Err(SendError(value)); + } + + state.waiter.take() + }; + + if let Some(waiter) = waiter { + waiter.complete(Ok(value)); + } else { + shared + .lock() + .expect("oneshot state should not be poisoned") + .value = Some(value); + } + + Ok(()) + } + + pub fn is_closed(&self) -> bool { + self.shared.as_ref().is_none_or(|shared| { + shared + .lock() + .expect("oneshot state should not be poisoned") + .receiver_closed + }) + } +} + +impl Receiver { + pub async fn recv(&mut self) -> Result { + let mut wait = None; + poll_fn(|cx| self.poll_recv(cx, &mut wait)).await + } + + pub fn try_recv(&mut self) -> Result { + if self.consumed { + return Err(TryRecvError::Closed); + } + + let mut state = self + .shared + .lock() + .expect("oneshot state should not be poisoned"); + if let Some(value) = state.value.take() { + self.consumed = true; + return Ok(value); + } + + if state.receiver_closed || !state.sender_alive { + self.consumed = true; + Err(TryRecvError::Closed) + } else { + Err(TryRecvError::Empty) + } + } + + pub fn close(&mut self) { + let mut state = self + .shared + .lock() + .expect("oneshot state should not be poisoned"); + state.receiver_closed = true; + } + + pub fn is_closed(&self) -> bool { + let state = self + .shared + .lock() + .expect("oneshot state should not be poisoned"); + state.receiver_closed || !state.sender_alive + } + + fn poll_recv( + &mut self, + cx: &mut Context<'_>, + wait: &mut Option>>, + ) -> Poll> { + if self.consumed { + return Poll::Ready(Err(RecvError)); + } + + if let Some(future) = wait.as_mut() { + match Pin::new(future).poll(cx) { + Poll::Ready(result) => { + wait.take(); + self.consumed = true; + Poll::Ready(result) + } + Poll::Pending => Poll::Pending, + } + } else { + let (future, handle) = runtime_waiter::>(); + let cancel_shared = Arc::clone(&self.shared); + let cancel_handle = handle.clone(); + handle.set_cancel(move || { + let mut state = cancel_shared + .lock() + .expect("oneshot state should not be poisoned"); + let _ = state.waiter.take(); + drop(state); + cancel_handle.finish(None); + }); + + let mut immediate = None; + { + let mut state = self + .shared + .lock() + .expect("oneshot state should not be poisoned"); + if let Some(value) = state.value.take() { + immediate = Some(Ok(value)); + } else if state.receiver_closed || !state.sender_alive { + immediate = Some(Err(RecvError)); + } else { + assert!( + state.waiter.is_none(), + "only one oneshot receive operation may wait at a time" + ); + state.waiter = Some(handle.clone()); + } + } + + if let Some(result) = immediate { + handle.complete(result); + } + + *wait = Some(future); + self.poll_recv(cx, wait) + } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + let Some(shared) = self.shared.take() else { + return; + }; + + let waiter = { + let mut state = shared.lock().expect("oneshot state should not be poisoned"); + if !state.sender_alive { + return; + } + + state.sender_alive = false; + if state.value.is_none() { + state.waiter.take() + } else { + None + } + }; + + if let Some(waiter) = waiter { + waiter.complete(Err(RecvError)); + } + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + let mut state = self + .shared + .lock() + .expect("oneshot state should not be poisoned"); + 
state.receiver_closed = true; + let _ = state.waiter.take(); + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use crate::{queue_future, queue_task, run, spawn_worker}; + + use super::{TryRecvError, channel}; + + #[test] + fn oneshot_cross_thread_round_trip() { + let result = Arc::new(Mutex::new(None::)); + let result_for_task = Arc::clone(&result); + + queue_task(move || { + let (sender, mut receiver) = channel(); + let result_for_task = Arc::clone(&result_for_task); + + let _worker = spawn_worker( + move || { + queue_task(move || { + sender.send(42usize).expect("oneshot send should succeed"); + }); + }, + || {}, + ); + + queue_future(async move { + let value = receiver.recv().await.expect("oneshot recv should succeed"); + *result_for_task.lock().unwrap() = Some(value); + }); + }); + run(); + + assert_eq!(*result.lock().unwrap(), Some(42)); + } + + #[test] + fn oneshot_try_recv_and_close() { + let (sender, mut receiver) = channel::(); + assert_eq!(receiver.try_recv(), Err(TryRecvError::Empty)); + receiver.close(); + assert!( + sender.send(7).is_err(), + "closed receiver should reject send" + ); + assert_eq!(receiver.try_recv(), Err(TryRecvError::Closed)); + } +} diff --git a/lib/runtime/src/fs.rs b/lib/runtime/src/fs.rs new file mode 100644 index 0000000..aa33f2d --- /dev/null +++ b/lib/runtime/src/fs.rs @@ -0,0 +1,552 @@ +//! Portable async filesystem API. +//! +//! Cancellation semantics: +//! - Dropping an I/O future cancels interest in the result. +//! - The runtime issues best-effort kernel cancellation where supported. +//! - The underlying OS operation may still complete after the future is dropped. + +use std::ffi::OsStr; +use std::io; +use std::os::fd::{AsRawFd, OwnedFd}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use crate::op::fs::{ + FileType as RawFileType, FsOp, MetadataTarget, OpenOptions as OpOpenOptions, + RawDirEntry as OpDirEntry, RawMetadata, +}; +use crate::sys::linux::fs as sys_fs; + +struct FileInner { + fd: OwnedFd, +} + +pub struct File { + inner: Arc, +} + +pub struct OpenOptions { + inner: OpOpenOptions, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Metadata { + inner: RawMetadata, +} + +pub struct ReadDir { + inner: sys_fs::ReadDirStream, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct DirEntry { + inner: OpDirEntry, +} + +impl File { + pub async fn open(path: impl AsRef) -> io::Result { + OpenOptions::new().read(true).open(path).await + } + + pub async fn create(path: impl AsRef) -> io::Result { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path) + .await + } + + pub async fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.read_impl(None, buf).await + } + + pub async fn read_exact(&mut self, mut buf: &mut [u8]) -> io::Result<()> { + while !buf.is_empty() { + let read = self.read(buf).await?; + if read == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )); + } + buf = &mut buf[read..]; + } + Ok(()) + } + + pub async fn write(&mut self, buf: &[u8]) -> io::Result { + self.write_impl(None, buf).await + } + + pub async fn write_all(&mut self, mut buf: &[u8]) -> io::Result<()> { + while !buf.is_empty() { + let written = self.write(buf).await?; + if written == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + buf = &buf[written..]; + } + Ok(()) + } + + pub async fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + + pub async fn sync_all(&self) -> io::Result<()> { + 
sys_fs::sync_all(FsOp::SyncAll { fd: self.raw_fd() }).await + } + + pub async fn sync_data(&self) -> io::Result<()> { + sys_fs::sync_data(FsOp::SyncData { fd: self.raw_fd() }).await + } + + pub async fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result { + self.read_impl(Some(offset), buf).await + } + + pub async fn read_exact_at(&self, mut offset: u64, mut buf: &mut [u8]) -> io::Result<()> { + while !buf.is_empty() { + let read = self.read_at(offset, buf).await?; + if read == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )); + } + offset = offset.saturating_add(read as u64); + buf = &mut buf[read..]; + } + Ok(()) + } + + pub async fn write_at(&self, offset: u64, buf: &[u8]) -> io::Result { + self.write_impl(Some(offset), buf).await + } + + pub async fn write_all_at(&self, mut offset: u64, mut buf: &[u8]) -> io::Result<()> { + while !buf.is_empty() { + let written = self.write_at(offset, buf).await?; + if written == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + offset = offset.saturating_add(written as u64); + buf = &buf[written..]; + } + Ok(()) + } + + pub async fn metadata(&self) -> io::Result { + sys_fs::metadata(FsOp::Metadata { + target: MetadataTarget::File(self.raw_fd()), + follow_symlinks: true, + }) + .await + .map(Metadata::from_raw) + } + + pub async fn set_len(&self, len: u64) -> io::Result<()> { + sys_fs::set_len(FsOp::SetLen { + fd: self.raw_fd(), + len, + }) + .await + } + + pub async fn try_clone(&self) -> io::Result { + sys_fs::try_clone(FsOp::Duplicate { fd: self.raw_fd() }) + .await + .map(File::from_owned_fd) + } + + fn from_owned_fd(fd: OwnedFd) -> Self { + Self { + inner: Arc::new(FileInner { fd }), + } + } + + fn raw_fd(&self) -> i32 { + self.inner.fd.as_raw_fd() + } + + async fn read_impl(&self, offset: Option, buf: &mut [u8]) -> io::Result { + let data = sys_fs::read(FsOp::Read { + fd: self.raw_fd(), + offset, + len: buf.len(), + }) + .await?; + + let read = data.len(); + buf[..read].copy_from_slice(&data); + Ok(read) + } + + async fn write_impl(&self, offset: Option, buf: &[u8]) -> io::Result { + sys_fs::write(FsOp::Write { + fd: self.raw_fd(), + offset, + data: buf.to_vec(), + }) + .await + } +} + +impl OpenOptions { + pub fn new() -> Self { + Self { + inner: OpOpenOptions::default(), + } + } + + pub fn read(&mut self, value: bool) -> &mut Self { + self.inner.read = value; + self + } + + pub fn write(&mut self, value: bool) -> &mut Self { + self.inner.write = value; + self + } + + pub fn append(&mut self, value: bool) -> &mut Self { + self.inner.append = value; + self + } + + pub fn truncate(&mut self, value: bool) -> &mut Self { + self.inner.truncate = value; + self + } + + pub fn create(&mut self, value: bool) -> &mut Self { + self.inner.create = value; + self + } + + pub fn create_new(&mut self, value: bool) -> &mut Self { + self.inner.create_new = value; + self + } + + pub async fn open(&self, path: impl AsRef) -> io::Result { + sys_fs::open(FsOp::Open { + path: path.as_ref().to_path_buf(), + options: self.inner.clone(), + }) + .await + .map(File::from_owned_fd) + } +} + +impl Default for OpenOptions { + fn default() -> Self { + Self::new() + } +} + +impl Metadata { + fn from_raw(inner: RawMetadata) -> Self { + Self { inner } + } + + pub fn len(&self) -> u64 { + self.inner.len + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn is_file(&self) -> bool { + self.inner.file_type == RawFileType::File + } + + pub fn 
is_dir(&self) -> bool { + self.inner.file_type == RawFileType::Directory + } + + pub fn is_symlink(&self) -> bool { + self.inner.file_type == RawFileType::Symlink + } + + pub fn mode(&self) -> u16 { + self.inner.mode + } +} + +impl ReadDir { + pub async fn next_entry(&mut self) -> io::Result> { + self.inner + .next_entry() + .await + .map(|entry| entry.map(|inner| DirEntry { inner })) + } +} + +impl DirEntry { + pub fn path(&self) -> PathBuf { + self.inner.path.clone() + } + + pub fn file_name(&self) -> &OsStr { + self.inner.file_name.as_os_str() + } + + pub async fn metadata(&self) -> io::Result { + metadata(self.path()).await + } +} + +pub async fn read(path: impl AsRef) -> io::Result> { + let mut file = File::open(path.as_ref()).await?; + let mut output = Vec::new(); + let mut chunk = vec![0; 8192]; + + loop { + let read = file.read(&mut chunk).await?; + if read == 0 { + return Ok(output); + } + output.extend_from_slice(&chunk[..read]); + } +} + +pub async fn read_to_string(path: impl AsRef) -> io::Result { + let bytes = read(path).await?; + String::from_utf8(bytes).map_err(|error| io::Error::new(io::ErrorKind::InvalidData, error)) +} + +pub async fn write(path: impl AsRef, data: impl AsRef<[u8]>) -> io::Result<()> { + let mut file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path) + .await?; + file.write_all(data.as_ref()).await +} + +pub async fn metadata(path: impl AsRef) -> io::Result { + sys_fs::metadata(FsOp::Metadata { + target: MetadataTarget::Path(path.as_ref().to_path_buf()), + follow_symlinks: true, + }) + .await + .map(Metadata::from_raw) +} + +pub async fn create_dir(path: impl AsRef) -> io::Result<()> { + sys_fs::create_dir(FsOp::CreateDir { + path: path.as_ref().to_path_buf(), + recursive: false, + mode: 0o777, + }) + .await +} + +pub async fn create_dir_all(path: impl AsRef) -> io::Result<()> { + let path = path.as_ref(); + let mut current = PathBuf::new(); + + for component in path.components() { + current.push(component.as_os_str()); + if current.as_os_str().is_empty() { + continue; + } + + match create_dir(¤t).await { + Ok(()) => {} + Err(error) if error.kind() == io::ErrorKind::AlreadyExists => {} + Err(error) => return Err(error), + } + } + + Ok(()) +} + +pub async fn remove_file(path: impl AsRef) -> io::Result<()> { + sys_fs::remove_file(FsOp::RemoveFile { + path: path.as_ref().to_path_buf(), + }) + .await +} + +pub async fn remove_dir(path: impl AsRef) -> io::Result<()> { + sys_fs::remove_dir(FsOp::RemoveDir { + path: path.as_ref().to_path_buf(), + }) + .await +} + +pub async fn rename(from: impl AsRef, to: impl AsRef) -> io::Result<()> { + sys_fs::rename(FsOp::Rename { + from: from.as_ref().to_path_buf(), + to: to.as_ref().to_path_buf(), + }) + .await +} + +pub async fn read_dir(path: impl AsRef) -> io::Result { + sys_fs::read_dir(FsOp::ReadDir { + path: path.as_ref().to_path_buf(), + }) + .map(|inner| ReadDir { inner }) +} + +#[cfg(test)] +mod tests { + use super::{ + OpenOptions, create_dir_all, metadata, read, read_dir, read_to_string, remove_dir, + remove_file, rename, write, + }; + use crate::queue_future; + use crate::{queue_task, run}; + use std::collections::BTreeSet; + use std::ffi::OsString; + use std::path::PathBuf; + use std::process; + use std::sync::Arc; + use std::sync::Mutex; + use std::sync::OnceLock; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn test_lock() -> &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) + } + + fn unique_path(label: &str) -> PathBuf { 
+ let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time should be after epoch") + .as_nanos(); + std::env::temp_dir().join(format!("ruin-runtime-{label}-{}-{nanos}", process::id())) + } + + #[test] + fn async_fs_round_trip() { + let _guard = test_lock().lock().unwrap(); + let root = unique_path("fs-round-trip"); + let nested = root.join("nested"); + let file_path = nested.join("hello.txt"); + let renamed_path = nested.join("renamed.txt"); + let output = Arc::new(Mutex::new(None::)); + + { + let output = Arc::clone(&output); + queue_task(move || { + queue_future(async move { + create_dir_all(&nested) + .await + .expect("dir creation should succeed"); + write(&file_path, b"hello world") + .await + .expect("initial write should succeed"); + + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&file_path) + .await + .expect("open should succeed"); + file.write_at(6, b"runtime") + .await + .expect("positioned write should succeed"); + file.sync_all().await.expect("sync should succeed"); + + let mut prefix = [0u8; 5]; + file.read_exact_at(0, &mut prefix) + .await + .expect("positioned read should succeed"); + assert_eq!(&prefix, b"hello"); + + let meta = file.metadata().await.expect("metadata should succeed"); + assert!(meta.is_file()); + assert!(meta.len() >= 13); + + let cloned = file.try_clone().await.expect("clone should succeed"); + cloned.set_len(13).await.expect("truncate should succeed"); + + rename(&file_path, &renamed_path) + .await + .expect("rename should succeed"); + let text = read_to_string(&renamed_path) + .await + .expect("read_to_string should succeed"); + assert_eq!(text, "hello runtime"); + + let bytes = read(&renamed_path).await.expect("read should succeed"); + assert_eq!(bytes, b"hello runtime"); + + let path_meta = metadata(&renamed_path) + .await + .expect("path metadata should work"); + assert!(path_meta.is_file()); + + *output.lock().unwrap() = Some(text); + + remove_file(&renamed_path) + .await + .expect("remove_file should succeed"); + remove_dir(&nested) + .await + .expect("remove nested dir should succeed"); + remove_dir(&root) + .await + .expect("remove root dir should succeed"); + }); + }); + } + + run(); + + assert_eq!(output.lock().unwrap().as_deref(), Some("hello runtime")); + } + + #[test] + fn async_read_dir_streams_entries() { + let _guard = test_lock().lock().unwrap(); + let root = unique_path("fs-read-dir"); + let one = root.join("one.txt"); + let two = root.join("two.txt"); + let seen: Arc>> = Arc::new(Mutex::new(BTreeSet::new())); + + { + let seen = Arc::clone(&seen); + queue_task(move || { + queue_future(async move { + create_dir_all(&root) + .await + .expect("dir creation should succeed"); + write(&one, b"1").await.expect("write one should succeed"); + write(&two, b"2").await.expect("write two should succeed"); + + let mut dir = read_dir(&root).await.expect("read_dir should succeed"); + while let Some(entry) = dir.next_entry().await.expect("stream should succeed") { + seen.lock() + .unwrap() + .insert(entry.file_name().to_os_string()); + } + + remove_file(&one).await.expect("remove one should succeed"); + remove_file(&two).await.expect("remove two should succeed"); + remove_dir(&root).await.expect("remove root should succeed"); + }); + }); + } + + run(); + + let seen = seen.lock().unwrap(); + assert!(seen.contains(&OsString::from("one.txt"))); + assert!(seen.contains(&OsString::from("two.txt"))); + } +} diff --git a/lib/runtime/src/lib.rs b/lib/runtime/src/lib.rs new file mode 100644 index 0000000..90390ff 
--- /dev/null +++ b/lib/runtime/src/lib.rs @@ -0,0 +1,78 @@ +//! RUIN runtime foundations. +//! +//! This crate provides a Linux x86_64 runtime substrate: the mesh allocator, the reactor, and a +//! single-threaded runtime loop with worker-thread task forwarding. + +#![feature(thread_local)] + +#[cfg(not(all(target_os = "linux", target_arch = "x86_64")))] +compile_error!("ruin-runtime currently supports only Linux x86_64."); + +extern crate alloc; + +pub mod channel; +pub mod fs; +pub mod net; +pub mod op; +pub mod platform; +pub mod sys; +pub mod time; + +pub use ruin_runtime_proc_macros::{async_main, main}; + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub use platform::linux_x86_64::mesh_alloc::{ + ActiveMeshGuard, Arena, AtomicBitmap, BitIter, CLASS_TO_SIZE, CompactionAdvice, + CompactionEstimate, CompactionRecommendation, CompactionSkipReason, + DEFAULT_GLOBAL_MINIHEAP_CAPACITY, FutexMutex, GlobalMeshAllocator, MeshAllocator, MeshStats, + MiniHeap, MiniHeapFlags, MiniHeapId, Mwc, Mwc64, NUM_SIZE_CLASSES, PageConfig, PlatformHooks, + PlatformInstallError, RelaxedBitmap, RuntimeCompactionPolicy, RuntimeCompactionResult, + ShuffleEntry, ShuffleVector, Span, ThreadLocalHeap, byte_size_for_class, + ensure_fault_mediation_installed, install_platform_hooks, ok_to_proceed, page_count, + page_shift, page_size, retry_on_efault, retry_on_efault_ptrs, round_up_to_page, + runtime_slots_per_span, size_class_for, +}; +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub use platform::linux_x86_64::mesh_alloc::{FreelistId, bitmaps_meshable}; +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub use platform::linux_x86_64::reactor::{ + Reactor, ReadyEvents, ThreadNotifier, create, create_reactor, monotonic_now, +}; +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub use platform::linux_x86_64::runtime::{ + IntervalHandle, JoinHandle, ThreadHandle, TimeoutHandle, WorkerHandle, clear_interval, + clear_timeout, current_thread_handle, queue_future, queue_microtask, queue_task, run, + set_interval, set_timeout, spawn_worker, yield_now, +}; + +pub const fn default_global_allocator() -> GlobalMeshAllocator { + GlobalMeshAllocator::with_default_config() +} + +#[cfg(test)] +mod tests { + use super::{MeshAllocator, page_size}; + + #[test] + fn mesh_allocator_smoke_test() { + let mut allocator = + MeshAllocator::new(page_size() * 1024, 256).expect("allocator should initialize"); + + let small = allocator + .allocate(64) + .expect("small allocation should succeed"); + unsafe { + small.write_bytes(0xAB, 64); + } + allocator.deallocate(small); + + let large_size = page_size() * 2; + let large = allocator + .allocate(large_size) + .expect("large allocation should succeed"); + unsafe { + large.write_bytes(0xCD, large_size); + } + allocator.deallocate(large); + } +} diff --git a/lib/runtime/src/net.rs b/lib/runtime/src/net.rs new file mode 100644 index 0000000..aa9a961 --- /dev/null +++ b/lib/runtime/src/net.rs @@ -0,0 +1,963 @@ +//! Portable async networking API. 
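+//!
+//! A minimal connect/echo sketch against a peer that replies with four bytes
+//! (illustrative only; assumes the crate is imported as `ruin_runtime` and that
+//! errors are simply unwrapped):
+//!
+//! ```ignore
+//! use ruin_runtime::net::TcpStream;
+//! use ruin_runtime::{queue_future, queue_task, run};
+//!
+//! queue_task(|| {
+//!     queue_future(async {
+//!         let mut stream = TcpStream::connect("127.0.0.1:8080").await.unwrap();
+//!         stream.write_all(b"ping").await.unwrap();
+//!         let mut reply = [0u8; 4];
+//!         stream.read_exact(&mut reply).await.unwrap();
+//!     });
+//! });
+//! run();
+//! ```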
+ +use std::future::Future; +use std::io; +use std::net::{Shutdown, SocketAddr, ToSocketAddrs}; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::Duration; + +use hyper::rt::{Read as HyperRead, ReadBufCursor, Write as HyperWrite}; + +use crate::op::net::NetOp; + +#[derive(Debug)] +struct TcpStreamInner { + fd: OwnedFd, + timeouts: Mutex, +} + +#[derive(Debug)] +struct TcpListenerInner { + fd: OwnedFd, +} + +#[derive(Debug)] +struct UdpSocketInner { + fd: OwnedFd, + timeouts: Mutex, +} + +#[derive(Clone, Copy, Debug, Default)] +struct SocketTimeouts { + read: Option, + write: Option, +} + +type PendingRead = Pin>> + 'static>>; +type PendingWrite = Pin> + 'static>>; +type PendingShutdown = Pin> + 'static>>; + +pub struct TcpStream { + inner: Arc, + pending_read: Option, + pending_write: Option, + pending_shutdown: Option, +} + +#[derive(Clone, Debug)] +pub struct TcpListener { + inner: Arc, +} + +#[derive(Debug)] +pub struct UdpSocket { + inner: Arc, +} + +impl TcpStream { + pub async fn connect(addr: A) -> io::Result + where + A: ToSocketAddrs + Send + 'static, + { + let addrs = crate::sys::linux::net::resolve_addrs(addr).await?; + let mut last_error = None; + for addr in addrs { + match crate::sys::linux::net::connect_stream(addr).await { + Ok(fd) => return Ok(Self::from_owned_fd(fd)), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::AddrNotAvailable, + "address resolution returned no usable TCP endpoints", + ) + })) + } + + pub async fn connect_timeout(addr: &SocketAddr, timeout: Duration) -> io::Result { + validate_timeout(timeout)?; + crate::sys::linux::net::connect_stream_timeout(*addr, timeout) + .await + .map(Self::from_owned_fd) + } + + pub async fn read(&mut self, buf: &mut [u8]) -> io::Result { + let data = match self.read_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::recv_timeout(self.raw_fd(), buf.len(), 0, timeout).await? + } + None => { + crate::sys::linux::net::recv(NetOp::Recv { + fd: self.raw_fd(), + len: buf.len(), + flags: 0, + }) + .await? 
+ } + }; + let read = data.len(); + buf[..read].copy_from_slice(&data); + Ok(read) + } + + pub async fn read_exact(&mut self, mut buf: &mut [u8]) -> io::Result<()> { + while !buf.is_empty() { + let read = self.read(buf).await?; + if read == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )); + } + buf = &mut buf[read..]; + } + Ok(()) + } + + pub async fn write(&mut self, buf: &[u8]) -> io::Result { + match self.write_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::send_timeout(self.raw_fd(), buf.to_vec(), 0, timeout).await + } + None => { + crate::sys::linux::net::send(NetOp::Send { + fd: self.raw_fd(), + data: buf.to_vec(), + flags: 0, + }) + .await + } + } + } + + pub async fn write_all(&mut self, mut buf: &[u8]) -> io::Result<()> { + while !buf.is_empty() { + let written = self.write(buf).await?; + if written == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + buf = &buf[written..]; + } + Ok(()) + } + + pub async fn shutdown(&self, how: Shutdown) -> io::Result<()> { + crate::sys::linux::net::shutdown(NetOp::Shutdown { + fd: self.raw_fd(), + how, + }) + .await + } + + pub async fn try_clone(&self) -> io::Result { + crate::sys::linux::net::duplicate(self.raw_fd()) + .await + .map(Self::from_owned_fd) + } + + pub fn local_addr(&self) -> io::Result { + crate::sys::linux::net::local_addr(self.raw_fd()) + } + + pub fn peer_addr(&self) -> io::Result { + crate::sys::linux::net::peer_addr(self.raw_fd()) + } + + pub fn nodelay(&self) -> io::Result { + crate::sys::linux::net::nodelay(self.raw_fd()) + } + + pub fn set_nodelay(&self, enabled: bool) -> io::Result<()> { + crate::sys::linux::net::set_nodelay(self.raw_fd(), enabled) + } + + pub fn ttl(&self) -> io::Result { + crate::sys::linux::net::ttl(self.raw_fd()) + } + + pub fn set_ttl(&self, ttl: u32) -> io::Result<()> { + crate::sys::linux::net::set_ttl(self.raw_fd(), ttl) + } + + pub fn read_timeout(&self) -> io::Result> { + Ok(self.read_timeout_value()) + } + + pub fn set_read_timeout(&self, timeout: Option) -> io::Result<()> { + validate_optional_timeout(timeout)?; + self.inner.timeouts.lock().unwrap().read = timeout; + Ok(()) + } + + pub fn write_timeout(&self) -> io::Result> { + Ok(self.write_timeout_value()) + } + + pub fn set_write_timeout(&self, timeout: Option) -> io::Result<()> { + validate_optional_timeout(timeout)?; + self.inner.timeouts.lock().unwrap().write = timeout; + Ok(()) + } + + fn from_owned_fd(fd: OwnedFd) -> Self { + Self { + inner: Arc::new(TcpStreamInner { + fd, + timeouts: Mutex::new(SocketTimeouts::default()), + }), + pending_read: None, + pending_write: None, + pending_shutdown: None, + } + } + + fn raw_fd(&self) -> RawFd { + self.inner.fd.as_raw_fd() + } + + fn read_timeout_value(&self) -> Option { + self.inner.timeouts.lock().unwrap().read + } + + fn write_timeout_value(&self) -> Option { + self.inner.timeouts.lock().unwrap().write + } +} + +impl TcpListener { + pub async fn bind(addr: A) -> io::Result + where + A: ToSocketAddrs + Send + 'static, + { + let addrs = crate::sys::linux::net::resolve_addrs(addr).await?; + let mut last_error = None; + for addr in addrs { + match crate::sys::linux::net::bind_listener(addr, None).await { + Ok(fd) => return Ok(Self::from_owned_fd(fd)), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::AddrNotAvailable, + "address resolution returned no usable listener endpoints", + ) + })) + } + 
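+    // A typical accept loop (illustrative sketch; assumes each connection is handled
+    // on this runtime via `queue_future`):
+    //
+    //     let listener = TcpListener::bind(("127.0.0.1", 0)).await?;
+    //     loop {
+    //         let (mut stream, _peer) = listener.accept().await?;
+    //         queue_future(async move {
+    //             let mut buf = [0u8; 1024];
+    //             let n = stream.read(&mut buf).await.expect("read should succeed");
+    //             stream
+    //                 .write_all(&buf[..n])
+    //                 .await
+    //                 .expect("write should succeed");
+    //         });
+    //     }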
+ pub async fn accept(&self) -> io::Result<(TcpStream, SocketAddr)> { + let accepted = crate::sys::linux::net::accept(NetOp::Accept { fd: self.raw_fd() }).await?; + + let stream = TcpStream::from_owned_fd(unsafe { OwnedFd::from_raw_fd(accepted.fd) }); + Ok((stream, accepted.peer_addr)) + } + + pub fn local_addr(&self) -> io::Result { + crate::sys::linux::net::local_addr(self.raw_fd()) + } + + pub fn ttl(&self) -> io::Result { + crate::sys::linux::net::ttl(self.raw_fd()) + } + + pub fn set_ttl(&self, ttl: u32) -> io::Result<()> { + crate::sys::linux::net::set_ttl(self.raw_fd(), ttl) + } + + fn from_owned_fd(fd: OwnedFd) -> Self { + Self { + inner: Arc::new(TcpListenerInner { fd }), + } + } + + fn raw_fd(&self) -> RawFd { + self.inner.fd.as_raw_fd() + } +} + +impl UdpSocket { + pub async fn bind(addr: A) -> io::Result + where + A: ToSocketAddrs + Send + 'static, + { + let addrs = crate::sys::linux::net::resolve_addrs(addr).await?; + let mut last_error = None; + for addr in addrs { + match crate::sys::linux::net::bind_datagram(addr).await { + Ok(fd) => return Ok(Self::from_owned_fd(fd)), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::AddrNotAvailable, + "address resolution returned no usable UDP endpoints", + ) + })) + } + + pub async fn connect(&self, addr: A) -> io::Result<()> + where + A: ToSocketAddrs + Send + 'static, + { + let addrs = crate::sys::linux::net::resolve_addrs(addr).await?; + let mut last_error = None; + for addr in addrs { + match crate::sys::linux::net::connect(NetOp::Connect { + fd: self.raw_fd(), + addr, + }) + .await + { + Ok(()) => return Ok(()), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::AddrNotAvailable, + "address resolution returned no usable UDP peers", + ) + })) + } + + pub async fn send(&self, buf: &[u8]) -> io::Result { + match self.write_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::send_timeout(self.raw_fd(), buf.to_vec(), 0, timeout).await + } + None => { + crate::sys::linux::net::send(NetOp::Send { + fd: self.raw_fd(), + data: buf.to_vec(), + flags: 0, + }) + .await + } + } + } + + pub async fn recv(&self, buf: &mut [u8]) -> io::Result { + let data = match self.read_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::recv_timeout(self.raw_fd(), buf.len(), 0, timeout).await? + } + None => { + crate::sys::linux::net::recv(NetOp::Recv { + fd: self.raw_fd(), + len: buf.len(), + flags: 0, + }) + .await? + } + }; + let read = data.len(); + buf[..read].copy_from_slice(&data); + Ok(read) + } + + pub async fn peek(&self, buf: &mut [u8]) -> io::Result { + let data = match self.read_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::recv_timeout( + self.raw_fd(), + buf.len(), + libc::MSG_PEEK, + timeout, + ) + .await? + } + None => { + crate::sys::linux::net::recv(NetOp::Recv { + fd: self.raw_fd(), + len: buf.len(), + flags: libc::MSG_PEEK, + }) + .await? 
+ } + }; + let read = data.len(); + buf[..read].copy_from_slice(&data); + Ok(read) + } + + pub async fn send_to(&self, buf: &[u8], addr: A) -> io::Result + where + A: ToSocketAddrs + Send + 'static, + { + let addrs = crate::sys::linux::net::resolve_addrs(addr).await?; + let mut last_error = None; + let timeout = self.write_timeout_value(); + for addr in addrs { + let result = match timeout { + Some(timeout) => { + crate::sys::linux::net::send_to_timeout( + self.raw_fd(), + buf.to_vec(), + addr, + 0, + timeout, + ) + .await + } + None => { + crate::sys::linux::net::send_to(NetOp::SendTo { + fd: self.raw_fd(), + target: addr, + data: buf.to_vec(), + flags: 0, + }) + .await + } + }; + match result { + Ok(sent) => return Ok(sent), + Err(error) => last_error = Some(error), + } + } + + Err(last_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::AddrNotAvailable, + "address resolution returned no usable UDP destinations", + ) + })) + } + + pub async fn recv_from(&self, buf: &mut [u8]) -> io::Result<(usize, SocketAddr)> { + let datagram = match self.read_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::recv_from_timeout(self.raw_fd(), buf.len(), 0, timeout) + .await? + } + None => { + crate::sys::linux::net::recv_from(NetOp::RecvFrom { + fd: self.raw_fd(), + len: buf.len(), + flags: 0, + }) + .await? + } + }; + let read = datagram.data.len(); + buf[..read].copy_from_slice(&datagram.data); + Ok((read, datagram.peer_addr)) + } + + pub async fn peek_from(&self, buf: &mut [u8]) -> io::Result<(usize, SocketAddr)> { + let datagram = match self.read_timeout_value() { + Some(timeout) => { + crate::sys::linux::net::recv_from_timeout( + self.raw_fd(), + buf.len(), + libc::MSG_PEEK, + timeout, + ) + .await? + } + None => { + crate::sys::linux::net::recv_from(NetOp::RecvFrom { + fd: self.raw_fd(), + len: buf.len(), + flags: libc::MSG_PEEK, + }) + .await? 
+ } + }; + let read = datagram.data.len(); + buf[..read].copy_from_slice(&datagram.data); + Ok((read, datagram.peer_addr)) + } + + pub async fn try_clone(&self) -> io::Result { + crate::sys::linux::net::duplicate(self.raw_fd()) + .await + .map(Self::from_owned_fd) + } + + pub fn local_addr(&self) -> io::Result { + crate::sys::linux::net::local_addr(self.raw_fd()) + } + + pub fn peer_addr(&self) -> io::Result { + crate::sys::linux::net::peer_addr(self.raw_fd()) + } + + pub fn broadcast(&self) -> io::Result { + crate::sys::linux::net::broadcast(self.raw_fd()) + } + + pub fn set_broadcast(&self, enabled: bool) -> io::Result<()> { + crate::sys::linux::net::set_broadcast(self.raw_fd(), enabled) + } + + pub fn ttl(&self) -> io::Result { + crate::sys::linux::net::ttl(self.raw_fd()) + } + + pub fn set_ttl(&self, ttl: u32) -> io::Result<()> { + crate::sys::linux::net::set_ttl(self.raw_fd(), ttl) + } + + pub fn read_timeout(&self) -> io::Result> { + Ok(self.read_timeout_value()) + } + + pub fn set_read_timeout(&self, timeout: Option) -> io::Result<()> { + validate_optional_timeout(timeout)?; + self.inner.timeouts.lock().unwrap().read = timeout; + Ok(()) + } + + pub fn write_timeout(&self) -> io::Result> { + Ok(self.write_timeout_value()) + } + + pub fn set_write_timeout(&self, timeout: Option) -> io::Result<()> { + validate_optional_timeout(timeout)?; + self.inner.timeouts.lock().unwrap().write = timeout; + Ok(()) + } + + fn from_owned_fd(fd: OwnedFd) -> Self { + Self { + inner: Arc::new(UdpSocketInner { + fd, + timeouts: Mutex::new(SocketTimeouts::default()), + }), + } + } + + fn raw_fd(&self) -> RawFd { + self.inner.fd.as_raw_fd() + } + + fn read_timeout_value(&self) -> Option { + self.inner.timeouts.lock().unwrap().read + } + + fn write_timeout_value(&self) -> Option { + self.inner.timeouts.lock().unwrap().write + } +} + +impl HyperRead for TcpStream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + mut buf: ReadBufCursor<'_>, + ) -> Poll> { + let this = self.get_mut(); + if buf.remaining() == 0 { + return Poll::Ready(Ok(())); + } + + if this.pending_read.is_none() { + this.pending_read = Some(match this.read_timeout_value() { + Some(timeout) => Box::pin(crate::sys::linux::net::recv_timeout( + this.raw_fd(), + buf.remaining(), + 0, + timeout, + )), + None => crate::sys::linux::net::recv_future(this.raw_fd(), buf.remaining()), + }); + } + + let poll = this + .pending_read + .as_mut() + .expect("pending read future should exist") + .as_mut() + .poll(cx); + match poll { + Poll::Ready(Ok(data)) => { + this.pending_read = None; + buf.put_slice(&data); + Poll::Ready(Ok(())) + } + Poll::Ready(Err(error)) => { + this.pending_read = None; + Poll::Ready(Err(error)) + } + Poll::Pending => Poll::Pending, + } + } +} + +impl HyperWrite for TcpStream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.get_mut(); + if buf.is_empty() { + return Poll::Ready(Ok(0)); + } + + if this.pending_write.is_none() { + this.pending_write = Some(match this.write_timeout_value() { + Some(timeout) => Box::pin(crate::sys::linux::net::send_timeout( + this.raw_fd(), + buf.to_vec(), + 0, + timeout, + )), + None => crate::sys::linux::net::send_future(this.raw_fd(), buf.to_vec()), + }); + } + + let poll = this + .pending_write + .as_mut() + .expect("pending write future should exist") + .as_mut() + .poll(cx); + match poll { + Poll::Ready(Ok(written)) => { + this.pending_write = None; + Poll::Ready(Ok(written)) + } + Poll::Ready(Err(error)) => { + 
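+                // Clear the stashed send future so the next poll_write starts a fresh
+                // operation rather than re-polling a completed one.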
this.pending_write = None; + Poll::Ready(Err(error)) + } + Poll::Pending => Poll::Pending, + } + } + + fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + if this.pending_shutdown.is_none() { + this.pending_shutdown = Some(crate::sys::linux::net::shutdown_future( + this.raw_fd(), + Shutdown::Write, + )); + } + + let poll = this + .pending_shutdown + .as_mut() + .expect("pending shutdown future should exist") + .as_mut() + .poll(cx); + match poll { + Poll::Ready(Ok(())) => { + this.pending_shutdown = None; + Poll::Ready(Ok(())) + } + Poll::Ready(Err(error)) => { + this.pending_shutdown = None; + Poll::Ready(Err(error)) + } + Poll::Pending => Poll::Pending, + } + } +} + +fn validate_optional_timeout(timeout: Option) -> io::Result<()> { + if let Some(timeout) = timeout { + validate_timeout(timeout)?; + } + Ok(()) +} + +fn validate_timeout(timeout: Duration) -> io::Result<()> { + if timeout.is_zero() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "zero-duration timeouts are not supported", + )); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + use std::time::Duration; + + use crate::{queue_future, queue_task, run}; + + use super::{TcpListener, TcpStream, UdpSocket}; + use std::io::ErrorKind; + use std::net::SocketAddr; + + #[test] + fn tcp_listener_and_stream_round_trip() { + let received = Arc::new(Mutex::new(None::>)); + let received_for_task = Arc::clone(&received); + + queue_task(move || { + let received_for_task = Arc::clone(&received_for_task); + queue_future(async move { + let listener = Arc::new( + TcpListener::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("listener should bind"), + ); + let local_addr = listener + .local_addr() + .expect("listener should expose address"); + + let listener_for_accept = Arc::clone(&listener); + let server = queue_future(async move { + let (mut stream, peer_addr) = listener_for_accept + .accept() + .await + .expect("listener should accept"); + assert_eq!(peer_addr.ip().to_string(), "127.0.0.1"); + + let mut buffer = [0; 32]; + let read = stream + .read(&mut buffer) + .await + .expect("server read should succeed"); + stream + .write_all(b"pong") + .await + .expect("server write should succeed"); + buffer[..read].to_vec() + }); + + let mut client = TcpStream::connect(local_addr) + .await + .expect("client should connect"); + client + .set_nodelay(true) + .expect("setting TCP_NODELAY should succeed"); + assert!( + client + .nodelay() + .expect("reading TCP_NODELAY should succeed"), + "TCP_NODELAY should be enabled", + ); + client + .write_all(b"ping") + .await + .expect("client write should succeed"); + let mut response = [0; 4]; + client + .read_exact(&mut response) + .await + .expect("client read should succeed"); + assert_eq!(&response, b"pong"); + + let server_bytes = server.await; + *received_for_task + .lock() + .expect("received buffer should not be poisoned") = Some(server_bytes); + }); + }); + run(); + + let received = received + .lock() + .expect("received buffer should not be poisoned"); + assert_eq!(received.as_deref(), Some(b"ping".as_slice())); + } + + #[test] + fn tcp_connect_resolves_localhost() { + let peer = Arc::new(Mutex::new(None::)); + let peer_for_task = Arc::clone(&peer); + + queue_task(move || { + let peer_for_task = Arc::clone(&peer_for_task); + queue_future(async move { + let listener = Arc::new( + 
TcpListener::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("listener should bind"), + ); + let port = listener + .local_addr() + .expect("listener should expose address") + .port(); + + let listener_for_accept = Arc::clone(&listener); + let server = queue_future(async move { + let (stream, peer_addr) = listener_for_accept + .accept() + .await + .expect("listener should accept"); + drop(stream); + peer_addr + }); + + let _client = TcpStream::connect(format!("localhost:{port}")) + .await + .expect("localhost DNS connect should succeed"); + let peer_addr = server.await; + *peer_for_task + .lock() + .expect("peer buffer should not be poisoned") = + Some(peer_addr.ip().to_string()); + }); + }); + run(); + + let peer = peer.lock().expect("peer buffer should not be poisoned"); + assert_eq!(peer.as_deref(), Some("127.0.0.1")); + } + + #[test] + fn udp_send_to_and_recv_from_round_trip() { + let server_received = Arc::new(Mutex::new(None::>)); + let server_received_for_task = Arc::clone(&server_received); + + queue_task(move || { + let server_received_for_task = Arc::clone(&server_received_for_task); + queue_future(async move { + let server = UdpSocket::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("server udp socket should bind"); + let client = UdpSocket::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("client udp socket should bind"); + + server + .set_broadcast(true) + .expect("enabling broadcast should succeed"); + assert!( + server + .broadcast() + .expect("reading broadcast should succeed"), + "broadcast should be enabled", + ); + client.set_ttl(42).expect("setting ttl should succeed"); + assert_eq!(client.ttl().expect("reading ttl should succeed"), 42); + + let server_addr = server.local_addr().expect("server should expose address"); + let client_addr = client.local_addr().expect("client should expose address"); + + let server_task = queue_future(async move { + let mut peek_buffer = [0; 32]; + let (peeked, peek_peer) = server + .peek_from(&mut peek_buffer) + .await + .expect("server peek_from should succeed"); + assert_eq!(&peek_buffer[..peeked], b"ping"); + assert_eq!(peek_peer, client_addr); + + let mut buffer = [0; 32]; + let (read, peer) = server + .recv_from(&mut buffer) + .await + .expect("server recv_from should succeed"); + assert_eq!(peer, client_addr); + server + .send_to(b"pong", peer) + .await + .expect("server send_to should succeed"); + buffer[..read].to_vec() + }); + + client + .send_to(b"ping", server_addr) + .await + .expect("client send_to should succeed"); + let mut response = [0; 32]; + let (read, peer) = client + .recv_from(&mut response) + .await + .expect("client recv_from should succeed"); + assert_eq!(peer, server_addr); + assert_eq!(&response[..read], b"pong"); + + let received = server_task.await; + *server_received_for_task.lock().unwrap() = Some(received); + }); + }); + run(); + + let server_received = server_received.lock().unwrap(); + assert_eq!(server_received.as_deref(), Some(b"ping".as_slice())); + } + + #[test] + fn udp_connected_sockets_and_timeouts_work() { + let observed = Arc::new(Mutex::new(Vec::new())); + let observed_for_task = Arc::clone(&observed); + + queue_task(move || { + let observed_for_task = Arc::clone(&observed_for_task); + queue_future(async move { + let server = UdpSocket::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("server udp socket should bind"); + let client = UdpSocket::bind(SocketAddr::from(([127, 0, 0, 1], 0))) + .await + .expect("client udp socket should bind"); + 
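+                // Connecting both sockets below fixes their peers, so the plain
+                // send/recv/peek calls apply; the 5ms read timeout is then observed as
+                // ErrorKind::TimedOut before the server sends anything.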
+ let server_addr = server.local_addr().expect("server should expose address"); + let client_addr = client.local_addr().expect("client should expose address"); + + client + .connect(server_addr) + .await + .expect("client udp connect should succeed"); + server + .connect(client_addr) + .await + .expect("server udp connect should succeed"); + + client + .set_read_timeout(Some(Duration::from_millis(5))) + .expect("setting read timeout should succeed"); + assert_eq!( + client + .read_timeout() + .expect("reading read timeout should succeed"), + Some(Duration::from_millis(5)) + ); + + let mut buffer = [0; 16]; + let error = client + .recv(&mut buffer) + .await + .expect_err("recv should time out before any datagram arrives"); + assert_eq!(error.kind(), ErrorKind::TimedOut); + observed_for_task + .lock() + .unwrap() + .push("timed out".to_string()); + + server + .send(b"hello") + .await + .expect("server send should succeed"); + + let peeked = client.peek(&mut buffer).await.expect("peek should succeed"); + assert_eq!(&buffer[..peeked], b"hello"); + + let read = client.recv(&mut buffer).await.expect("recv should succeed"); + assert_eq!(&buffer[..read], b"hello"); + observed_for_task + .lock() + .unwrap() + .push("received".to_string()); + }); + }); + run(); + + let observed = observed.lock().unwrap(); + assert_eq!(observed.as_slice(), ["timed out", "received"]); + } +} diff --git a/lib/runtime/src/op/completion.rs b/lib/runtime/src/op/completion.rs new file mode 100644 index 0000000..b52e2cc --- /dev/null +++ b/lib/runtime/src/op/completion.rs @@ -0,0 +1,147 @@ +#![allow(dead_code)] + +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll, Waker}; + +use crate::platform::linux_x86_64::runtime::{ThreadHandle, current_thread_handle}; + +type CancelCallback = Box; + +struct CompletionState { + owner: ThreadHandle, + interested: AtomicBool, + finished: AtomicBool, + wake_queued: AtomicBool, + result: Mutex>, + waker: Mutex>, + cancel: Mutex>, +} + +impl CompletionState { + fn queue_wake(self: &Arc) { + if self.wake_queued.swap(true, Ordering::AcqRel) { + return; + } + + let state = Arc::clone(self); + if !self.owner.queue_microtask(move || { + state.wake_queued.store(false, Ordering::Release); + if let Some(waker) = state.waker.lock().unwrap().take() { + waker.wake(); + } + }) { + self.wake_queued.store(false, Ordering::Release); + } + } +} + +pub(crate) struct CompletionFuture { + state: Arc>, +} + +pub(crate) struct CompletionHandle { + state: Arc>, +} + +impl Clone for CompletionHandle { + fn clone(&self) -> Self { + Self { + state: Arc::clone(&self.state), + } + } +} + +pub(crate) fn completion( + owner: ThreadHandle, +) -> (CompletionFuture, CompletionHandle) { + owner.begin_async_operation(); + let state = Arc::new(CompletionState { + owner, + interested: AtomicBool::new(true), + finished: AtomicBool::new(false), + wake_queued: AtomicBool::new(false), + result: Mutex::new(None), + waker: Mutex::new(None), + cancel: Mutex::new(None), + }); + + ( + CompletionFuture { + state: Arc::clone(&state), + }, + CompletionHandle { state }, + ) +} + +pub(crate) fn completion_for_current_thread() +-> (CompletionFuture, CompletionHandle) { + completion(current_thread_handle()) +} + +impl CompletionHandle { + pub(crate) fn complete(self, value: T) { + self.finish(Some(value)); + } + + pub(crate) fn finish(self, value: Option) { + if self.state.finished.swap(true, Ordering::AcqRel) { + return; + } + + let 
interested = self.state.interested.load(Ordering::Acquire); + if interested { + *self.state.result.lock().unwrap() = value; + self.state.queue_wake(); + } + + let _ = self.state.cancel.lock().unwrap().take(); + self.state.owner.finish_async_operation(); + } + + pub(crate) fn set_cancel(&self, cancel: impl FnOnce() + Send + 'static) { + *self.state.cancel.lock().unwrap() = Some(Box::new(cancel)); + } + + pub(crate) fn is_interested(&self) -> bool { + self.state.interested.load(Ordering::Acquire) + } +} + +impl Future for CompletionFuture { + type Output = T; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if let Some(value) = self.state.result.lock().unwrap().take() { + return Poll::Ready(value); + } + + *self.state.waker.lock().unwrap() = Some(cx.waker().clone()); + + if let Some(value) = self.state.result.lock().unwrap().take() { + let _ = self.state.waker.lock().unwrap().take(); + return Poll::Ready(value); + } + + Poll::Pending + } +} + +impl Drop for CompletionFuture { + fn drop(&mut self) { + if !self.state.interested.swap(false, Ordering::AcqRel) { + return; + } + + let _ = self.state.result.lock().unwrap().take(); + let _ = self.state.waker.lock().unwrap().take(); + + if !self.state.finished.load(Ordering::Acquire) + && let Some(cancel) = self.state.cancel.lock().unwrap().take() + { + cancel(); + } + } +} diff --git a/lib/runtime/src/op/fs.rs b/lib/runtime/src/op/fs.rs new file mode 100644 index 0000000..78264a5 --- /dev/null +++ b/lib/runtime/src/op/fs.rs @@ -0,0 +1,105 @@ +//! Logical filesystem operations. +//! +//! This layer owns request data so the public API can keep borrowed buffers while platform +//! backends pin, stage, or offload as needed. + +use std::ffi::OsString; +use std::os::fd::RawFd; +use std::path::PathBuf; + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct OpenOptions { + pub read: bool, + pub write: bool, + pub append: bool, + pub truncate: bool, + pub create: bool, + pub create_new: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MetadataTarget { + Path(PathBuf), + File(RawFd), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum FileType { + File, + Directory, + Symlink, + BlockDevice, + CharacterDevice, + Fifo, + Socket, + Unknown, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RawMetadata { + pub file_type: FileType, + pub mode: u16, + pub len: u64, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RawDirEntry { + pub path: PathBuf, + pub file_name: OsString, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum FsOp { + Open { + path: PathBuf, + options: OpenOptions, + }, + Read { + fd: RawFd, + offset: Option, + len: usize, + }, + Write { + fd: RawFd, + offset: Option, + data: Vec, + }, + Metadata { + target: MetadataTarget, + follow_symlinks: bool, + }, + SetLen { + fd: RawFd, + len: u64, + }, + SyncAll { + fd: RawFd, + }, + SyncData { + fd: RawFd, + }, + Duplicate { + fd: RawFd, + }, + CreateDir { + path: PathBuf, + recursive: bool, + mode: u32, + }, + RemoveFile { + path: PathBuf, + }, + RemoveDir { + path: PathBuf, + }, + Rename { + from: PathBuf, + to: PathBuf, + }, + ReadDir { + path: PathBuf, + }, + Close { + fd: RawFd, + }, +} diff --git a/lib/runtime/src/op/mod.rs b/lib/runtime/src/op/mod.rs new file mode 100644 index 0000000..9b68e94 --- /dev/null +++ b/lib/runtime/src/op/mod.rs @@ -0,0 +1,8 @@ +//! Internal and public operation-layer building blocks. +//! +//! The operation layer defines logical work units that bridge user-facing APIs and platform +//! 
backends without leaking platform details upward. + +pub(crate) mod completion; +pub mod fs; +pub mod net; diff --git a/lib/runtime/src/op/net.rs b/lib/runtime/src/op/net.rs new file mode 100644 index 0000000..fc9f87e --- /dev/null +++ b/lib/runtime/src/op/net.rs @@ -0,0 +1,69 @@ +//! Logical networking operations shared between the public API and Linux backend. + +use std::net::{Shutdown, SocketAddr}; +use std::os::fd::RawFd; + +#[derive(Debug)] +pub enum NetOp { + Socket { + domain: i32, + socket_type: i32, + protocol: i32, + flags: u32, + }, + Connect { + fd: RawFd, + addr: SocketAddr, + }, + Bind { + fd: RawFd, + addr: SocketAddr, + }, + Listen { + fd: RawFd, + backlog: i32, + }, + Accept { + fd: RawFd, + }, + Send { + fd: RawFd, + data: Vec, + flags: i32, + }, + SendTo { + fd: RawFd, + target: SocketAddr, + data: Vec, + flags: i32, + }, + Recv { + fd: RawFd, + len: usize, + flags: i32, + }, + RecvFrom { + fd: RawFd, + len: usize, + flags: i32, + }, + Shutdown { + fd: RawFd, + how: Shutdown, + }, + Close { + fd: RawFd, + }, +} + +#[derive(Clone, Debug)] +pub struct AcceptedSocket { + pub fd: RawFd, + pub peer_addr: SocketAddr, +} + +#[derive(Clone, Debug)] +pub struct ReceivedDatagram { + pub data: Vec, + pub peer_addr: SocketAddr, +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/allocator.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/allocator.rs new file mode 100644 index 0000000..d7051d0 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/allocator.rs @@ -0,0 +1,864 @@ +use core::alloc::Layout; +use core::mem::size_of; +use core::ptr::copy_nonoverlapping; +use core::sync::atomic::{AtomicU32, Ordering}; + +use super::arena::Arena; +use super::constants::{ + MAX_ATTACHED_MINIHEAPS_PER_CLASS, MAX_SMALL_ALLOCATION, MIN_SHUFFLE_VECTOR_LENGTH, + MINIHEAP_REFILL_GOAL_SIZE, NUM_SIZE_CLASSES, is_below_partial_threshold, +}; +use super::fault::{self, ActiveMeshGuard}; +use super::meshing::bitmaps_meshable; +use super::miniheap::{MiniHeap, MiniHeapId}; +use super::page::{page_count, page_size, runtime_slots_per_span}; +use super::platform; +use super::pool::MiniHeapPool; +use super::raw_sys; +use super::rng::Mwc; +use super::shuffle::ShuffleEntry; +use super::size_map::{byte_size_for_class, size_class_for}; +use super::stats::{MeshStats, StatsState}; +use super::sync::{FutexMutex, futex_wait_for_value, futex_wake_all}; +use super::thread_local_heap::ThreadLocalHeap; + +#[derive(Debug)] +pub struct MeshAllocator { + arena: Arena, + pool: MiniHeapPool, + bootstrap_thread: *mut ThreadLocalHeap, + compaction_candidates: *mut MiniHeapId, + meshing_rng: Mwc, + mesh_epoch: AtomicU32, + pool_lock: FutexMutex, + stats: StatsState, +} + +#[derive(Clone, Copy, Debug)] +struct ResolvedPtr { + owner_id: MiniHeapId, + slot: usize, +} + +impl MeshAllocator { + pub fn new(arena_size: usize, miniheap_capacity: u32) -> raw_sys::Result { + fault::ensure_fault_mediation_installed()?; + let bootstrap_thread = unsafe { + platform::map_anonymous( + size_of::(), + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + )? as *mut ThreadLocalHeap + }; + unsafe { + bootstrap_thread.write(ThreadLocalHeap::new()?); + } + let compaction_candidates = unsafe { + platform::map_anonymous( + miniheap_capacity as usize * size_of::(), + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + )? 
as *mut MiniHeapId + }; + + Ok(Self { + arena: Arena::with_size(arena_size)?, + pool: MiniHeapPool::with_capacity(miniheap_capacity)?, + bootstrap_thread, + compaction_candidates, + meshing_rng: Mwc::from_os_seed()?, + mesh_epoch: AtomicU32::new(0), + pool_lock: FutexMutex::new(), + stats: StatsState::new(), + }) + } + + #[inline(always)] + pub fn arena(&self) -> &Arena { + &self.arena + } + + #[inline(always)] + pub fn pool(&self) -> &MiniHeapPool { + &self.pool + } + + #[inline(always)] + pub fn live_miniheap_count(&self) -> u32 { + self.pool.live_len() + } + + pub fn stats(&self) -> MeshStats { + let page = page_size(); + let (reusable_span_count, reusable_pages) = self.arena.reusable_span_stats(); + let counters = self.stats.snapshot(); + let mut stats = MeshStats { + arena_size: self.arena.arena_size(), + reserved_bytes: self.arena.reserved_pages() as usize * page, + reusable_span_count, + reusable_span_bytes: reusable_pages as usize * page, + live_miniheaps: self.pool.live_len(), + small_allocations: counters.small_allocations, + small_deallocations: counters.small_deallocations, + large_allocations: counters.large_allocations, + large_deallocations: counters.large_deallocations, + compact_calls: counters.compact_calls, + meshes_performed: counters.meshes_performed, + meshed_pages: counters.meshed_pages, + meshed_bytes: counters.meshed_bytes, + ..MeshStats::default() + }; + + let mut candidate_heaps_by_class = [0u32; NUM_SIZE_CLASSES]; + let mut candidate_pages_by_class = [0u32; NUM_SIZE_CLASSES]; + let mut candidate_free_bytes_by_class = [0usize; NUM_SIZE_CLASSES]; + let mut candidate_span_bytes_by_class = [0usize; NUM_SIZE_CLASSES]; + + let len = self.pool.len(); + let mut id_value = 1u32; + while id_value <= len { + let id = MiniHeapId::new(id_value); + if let Some(heap) = self.pool.get(id) { + if heap.is_large_alloc() { + stats.live_large_allocations += 1; + stats.live_large_bytes += heap.span_size(); + stats.retained_large_span_bytes += heap.span_size(); + id_value += 1; + continue; + } + + stats.live_small_heaps += 1; + stats.live_small_bytes += heap.in_use_count() as usize * heap.object_size(); + stats.virtual_small_span_bytes += heap.span_size(); + + if heap.is_meshed() { + stats.meshed_small_heaps += 1; + } else { + stats.retained_small_span_bytes += heap.span_size(); + } + + if heap.is_full() { + stats.full_small_heaps += 1; + } else if !heap.is_empty() { + stats.partial_small_heaps += 1; + } + + if !heap.is_attached() && !heap.is_full() && !heap.is_meshed() { + stats.reusable_small_heaps += 1; + } + + if self.heap_is_compaction_candidate(heap.size_class(), heap) { + let class = heap.size_class() as usize; + stats.compaction.candidate_heaps += 1; + stats.compaction.candidate_pages += heap.span().length; + stats.compaction.candidate_free_bytes += heap.bytes_free(); + candidate_heaps_by_class[class] += 1; + candidate_pages_by_class[class] += heap.span().length; + candidate_free_bytes_by_class[class] += heap.bytes_free(); + candidate_span_bytes_by_class[class] = heap.span_size(); + } + } + id_value += 1; + } + + let mut class = 1usize; + while class < NUM_SIZE_CLASSES { + let span_bytes = candidate_span_bytes_by_class[class]; + if let Some(pair_bound_by_free) = + candidate_free_bytes_by_class[class].checked_div(span_bytes) + { + let pair_bound_by_count = candidate_heaps_by_class[class] / 2; + let best_case_meshes = pair_bound_by_count.min(pair_bound_by_free as u32); + let pages_per_mesh = + candidate_pages_by_class[class] / candidate_heaps_by_class[class].max(1); + 
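+                // Best-case estimate: each mesh pairs two candidate heaps, so meshes are
+                // bounded both by half the candidate count and by how many whole spans
+                // the candidates' free bytes could absorb; pages_per_mesh is the average
+                // candidate span length used to convert meshes into reclaimable pages.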
stats.compaction.best_case_meshes += best_case_meshes; + stats.compaction.best_case_reclaimable_pages += best_case_meshes * pages_per_mesh; + stats.compaction.best_case_reclaimable_bytes += + best_case_meshes as usize * span_bytes; + } + class += 1; + } + + stats + } + + pub fn allocate(&mut self, size: usize) -> Option<*mut u8> { + let thread_heap = unsafe { &mut *self.bootstrap_thread }; + self.allocate_with_thread(size, thread_heap) + } + + pub fn allocate_with_thread( + &mut self, + size: usize, + thread_heap: &mut ThreadLocalHeap, + ) -> Option<*mut u8> { + let size = size.max(1); + if size <= MAX_SMALL_ALLOCATION { + let class = size_class_for(size)?; + if let Some(ptr) = self.try_allocate_small_local(thread_heap, class) { + return Some(ptr); + } + return self.allocate_small_with_thread(thread_heap, class); + } + + self.allocate_large(size) + } + + pub fn allocate_layout(&mut self, layout: Layout) -> Option<*mut u8> { + let thread_heap = unsafe { &mut *self.bootstrap_thread }; + self.allocate_layout_with_thread(thread_heap, layout) + } + + pub fn allocate_layout_with_thread( + &mut self, + thread_heap: &mut ThreadLocalHeap, + layout: Layout, + ) -> Option<*mut u8> { + let aligned_size = round_up_to_alignment(layout.size().max(1), layout.align())?; + if aligned_size <= MAX_SMALL_ALLOCATION && layout.align() <= page_size() { + let class = size_class_for(aligned_size)?; + if byte_size_for_class(class).is_multiple_of(layout.align()) { + if let Some(ptr) = self.try_allocate_small_local(thread_heap, class) { + return Some(ptr); + } + return self.allocate_small_with_thread(thread_heap, class); + } + } + + self.allocate_large_aligned(aligned_size, layout.align()) + } + + pub fn deallocate(&mut self, ptr: *mut u8) { + let thread_heap = unsafe { &mut *self.bootstrap_thread }; + self.deallocate_with_thread(ptr, thread_heap); + } + + pub fn deallocate_with_thread(&mut self, ptr: *mut u8, thread_heap: &mut ThreadLocalHeap) { + if ptr.is_null() { + return; + } + + let Some(resolved) = self.resolve_pointer(ptr) else { + return; + }; + let id = resolved.owner_id; + let Some(heap) = self.pool.get(id) else { + return; + }; + + if heap.is_large_alloc() { + let span = heap.span(); + self.stats.record_large_deallocation(); + let _ = heap.free_offset(0); + self.arena.clear_miniheap(span); + self.arena.release_span(span); + let _ = { + let _guard = self.pool_lock.lock(); + self.pool.release(id) + }; + return; + } + + let slot = resolved.slot; + let class = heap.size_class(); + let thread_id = thread_heap.thread_id(); + self.stats.record_small_deallocation(); + + if heap.current_thread() == thread_id && heap.is_attached() && !heap.is_meshed() { + let state = thread_heap.class_mut(class); + let attached_idx = state.find_attached(id); + if let Some(attached_idx) = attached_idx + && !state.shuffle.is_full() + { + let cached = state.shuffle.count_entries_for_offset(attached_idx as u16); + if cached + 1 == heap.max_count() as usize { + self.release_class_attached(thread_heap, class); + if let Some(heap) = self.pool.get(id) { + let _ = heap.free_offset(slot); + } + self.reclaim_empty_detached_heap(id); + return; + } + if cached + 1 < heap.max_count() as usize { + state + .shuffle + .push(ShuffleEntry::new(attached_idx as u16, slot as u16)); + return; + } + } + } + + let state = thread_heap.class_mut(class); + let _ = heap.free_offset(slot); + if heap.is_attached() + && is_below_partial_threshold(heap.in_use_count(), heap.max_count() as u32) + { + heap.unset_attached(); + if let Some(attached_idx) = 
state.find_attached(id) { + state.attached_ids[attached_idx as usize] = MiniHeapId::new(0); + state.attached_heaps[attached_idx as usize] = core::ptr::null(); + } + } + + if heap.is_empty() && !heap.is_meshed() { + self.reclaim_empty_detached_heap(id); + } + } + + pub fn deallocate_layout(&mut self, ptr: *mut u8, _layout: Layout) { + self.deallocate(ptr); + } + + pub fn try_deallocate_local(&self, ptr: *mut u8, thread_heap: &mut ThreadLocalHeap) -> bool { + if ptr.is_null() { + return true; + } + + if self.mesh_epoch.load(Ordering::Acquire) & 1 != 0 { + return false; + } + + let Some(resolved) = self.resolve_pointer(ptr) else { + return true; + }; + let id = resolved.owner_id; + let Some(heap) = self.pool.get(id) else { + return true; + }; + if heap.is_large_alloc() + || heap.current_thread() != thread_heap.thread_id() + || heap.is_meshed() + || !heap.contains_ptr(self.arena.base_ptr() as usize, ptr) + { + return false; + } + + let class = heap.size_class(); + let slot = resolved.slot; + let state = thread_heap.class_mut(class); + let Some(attached_idx) = state.find_attached(id) else { + return false; + }; + if state.shuffle.is_full() { + return false; + } + + let cached = state.shuffle.count_entries_for_offset(attached_idx as u16); + if cached + 1 >= heap.max_count() as usize { + return false; + } + + state + .shuffle + .push(ShuffleEntry::new(attached_idx as u16, slot as u16)); + self.stats.record_small_deallocation(); + true + } + + /// # Safety + /// + /// `ptr` must have been allocated by this allocator with `layout`, and must not be used + /// after this call if a new allocation is returned. + pub unsafe fn reallocate( + &mut self, + ptr: *mut u8, + layout: Layout, + new_size: usize, + ) -> Option<*mut u8> { + let thread_heap = unsafe { &mut *self.bootstrap_thread }; + unsafe { self.reallocate_with_thread(ptr, layout, new_size, thread_heap) } + } + + /// # Safety + /// + /// `ptr` must have been allocated by this allocator with `layout`, and must not be used + /// after this call if a new allocation is returned. 
+ pub unsafe fn reallocate_with_thread( + &mut self, + ptr: *mut u8, + layout: Layout, + new_size: usize, + thread_heap: &mut ThreadLocalHeap, + ) -> Option<*mut u8> { + if ptr.is_null() { + return self.allocate_layout_with_thread( + thread_heap, + Layout::from_size_align(new_size.max(1), layout.align()).ok()?, + ); + } + if new_size == 0 { + self.deallocate_with_thread(ptr, thread_heap); + return None; + } + + let new_layout = Layout::from_size_align(new_size, layout.align()).ok()?; + let new_ptr = self.allocate_layout_with_thread(thread_heap, new_layout)?; + unsafe { + copy_nonoverlapping(ptr, new_ptr, layout.size().min(new_size)); + } + self.deallocate_with_thread(ptr, thread_heap); + Some(new_ptr) + } + + pub fn compact(&mut self) -> usize { + let thread_heap = unsafe { &mut *self.bootstrap_thread }; + self.compact_with_thread(thread_heap) + } + + pub fn compact_with_thread(&mut self, thread_heap: &mut ThreadLocalHeap) -> usize { + let _epoch_guard = MeshingEpochGuard::new(core::ptr::addr_of!(self.mesh_epoch)); + self.stats.record_compact_call(); + self.shutdown_thread(thread_heap); + + let mut meshes = 0usize; + for class_idx in 1..NUM_SIZE_CLASSES { + meshes += self.mesh_class_candidates(class_idx as u8); + } + + meshes + } + + pub fn try_allocate_small_local( + &self, + thread_heap: &mut ThreadLocalHeap, + class: u8, + ) -> Option<*mut u8> { + if self.mesh_epoch.load(Ordering::Acquire) & 1 != 0 { + return None; + } + if thread_heap.class(class).shuffle.is_exhausted() && !self.local_refill(thread_heap, class) + { + return None; + } + + let state = thread_heap.class_mut(class); + let entry = state.shuffle.pop()?; + let heap = state.heap_at(entry.miniheap_offset as usize)?; + self.stats.record_small_allocation(); + Some(heap.ptr_from_offset(self.arena.base_ptr() as usize, entry.slot_index as usize)) + } + + fn allocate_small_with_thread( + &mut self, + thread_heap: &mut ThreadLocalHeap, + class: u8, + ) -> Option<*mut u8> { + self.global_refill(thread_heap, class)?; + self.try_allocate_small_local(thread_heap, class) + } + + fn allocate_large(&mut self, size: usize) -> Option<*mut u8> { + self.allocate_large_aligned(size, 1) + } + + fn allocate_large_aligned(&mut self, size: usize, align: usize) -> Option<*mut u8> { + let page_align = page_alignment_for(align)?; + let (_, span) = self.arena.allocate_bytes(size, page_align)?; + let (id, heap) = { + let _guard = self.pool_lock.lock(); + self.pool.allocate(span, 1, size)? 
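+            // The pool lock is scoped to this block: miniheap slot allocation needs it,
+            // but the arena bookkeeping that follows does not.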
+ }; + self.arena.track_miniheap(span, id); + self.stats.record_large_allocation(); + heap.malloc_at(self.arena.base_ptr() as usize, 0) + } + + fn local_refill(&self, thread_heap: &mut ThreadLocalHeap, class: u8) -> bool { + let state = thread_heap.class_mut(class); + let count = state.attached_len as usize; + if count == 0 { + return false; + } + + let mut scanned = 0usize; + while scanned < count && state.shuffle.is_exhausted() { + let idx = (state.attached_cursor as usize) % count; + state.attached_cursor = ((idx + 1) % count) as u8; + let heap_ptr = state.attached_heaps[idx]; + if !heap_ptr.is_null() { + let heap = unsafe { &*heap_ptr }; + if !heap.is_full() { + let _ = state.shuffle.refill_from_heap(idx as u16, heap); + } + } + scanned += 1; + } + + !state.shuffle.is_exhausted() + } + + fn global_refill(&mut self, thread_heap: &mut ThreadLocalHeap, class: u8) -> Option<()> { + self.release_class_attached(thread_heap, class); + + let object_size = byte_size_for_class(class); + let object_count = miniheap_object_count(object_size); + let page_count = page_count(object_size * object_count) as u32; + let thread_id = thread_heap.thread_id(); + + let mut bytes_free = self.attach_reusable_heaps(thread_heap, class); + while bytes_free < MINIHEAP_REFILL_GOAL_SIZE && !thread_heap.class(class).attached_full() { + let (_, span) = self.arena.page_alloc(page_count, 1)?; + let (id, heap) = { + let _guard = self.pool_lock.lock(); + self.pool.allocate(span, object_count as u16, object_size)? + }; + self.arena.track_miniheap(span, id); + + let slot = thread_heap + .class_mut(class) + .push_attached(id, heap as *const MiniHeap)?; + heap.set_attached(thread_id); + heap.set_shuffle_vector_offset(slot); + bytes_free += heap.bytes_free(); + } + + thread_heap.class_mut(class).shuffle.clear(); + if self.local_refill(thread_heap, class) { + Some(()) + } else { + None + } + } + + fn attach_reusable_heaps(&mut self, thread_heap: &mut ThreadLocalHeap, class: u8) -> usize { + let mut bytes_free = 0usize; + let len = self.pool.len(); + let thread_id = thread_heap.thread_id(); + + let mut id_val = 1u32; + while id_val <= len + && bytes_free < MINIHEAP_REFILL_GOAL_SIZE + && !thread_heap.class(class).attached_full() + { + let id = MiniHeapId::new(id_val); + if let Some(heap) = self.pool.get(id) + && heap.size_class() == class + && !heap.is_attached() + && !heap.is_full() + && !heap.is_meshed() + && let Some(slot) = thread_heap + .class_mut(class) + .push_attached(id, heap as *const MiniHeap) + { + heap.set_attached(thread_id); + heap.set_shuffle_vector_offset(slot); + bytes_free += heap.bytes_free(); + } + id_val += 1; + } + + bytes_free + } + + fn release_all_attached(&mut self, thread_heap: &mut ThreadLocalHeap) { + for class in 1..NUM_SIZE_CLASSES as u8 { + self.release_class_attached(thread_heap, class); + } + } + + pub fn shutdown_thread(&mut self, thread_heap: &mut ThreadLocalHeap) { + self.release_all_attached(thread_heap); + } + + fn reclaim_empty_detached_heap(&mut self, id: MiniHeapId) { + let Some(heap) = self.pool.get(id) else { + return; + }; + if !heap.is_empty() || heap.is_meshed() || heap.is_attached() || heap.has_meshed_partner() { + return; + } + + let span = heap.span(); + self.arena.clear_miniheap(span); + self.arena.release_span(span); + let _ = { + let _guard = self.pool_lock.lock(); + self.pool.release(id) + }; + } + + fn release_class_attached(&mut self, thread_heap: &mut ThreadLocalHeap, class: u8) { + let mut released_ids = [MiniHeapId::new(0); MAX_ATTACHED_MINIHEAPS_PER_CLASS]; + let 
released_len; + { + let state = thread_heap.class_mut(class); + for entry in state.shuffle.active_entries() { + let attached_idx = entry.miniheap_offset as usize; + if attached_idx >= state.attached_len as usize { + continue; + } + + if let Some(heap) = state.heap_at(attached_idx) { + let _ = heap.free_offset(entry.slot_index as usize); + } + } + + released_len = state.attached_len as usize; + for (idx, id) in released_ids.iter_mut().enumerate().take(released_len) { + *id = state.attached_ids[idx]; + if let Some(heap) = state.heap_at(idx) { + heap.unset_attached(); + } + } + state.clear_attached(); + } + + for id in released_ids.into_iter().take(released_len) { + if id != MiniHeapId::new(0) { + self.reclaim_empty_detached_heap(id); + } + } + } + + fn heap_is_compaction_candidate(&self, class: u8, heap: &MiniHeap) -> bool { + heap.size_class() == class + && !heap.is_attached() + && !heap.is_full() + && !heap.is_meshed() + && heap.object_size() < page_size() + && is_below_partial_threshold(heap.in_use_count(), heap.max_count() as u32) + } + + fn mesh_pair(&mut self, dst_id: MiniHeapId, src_id: MiniHeapId) -> raw_sys::Result<()> { + let dst = self.pool.get(dst_id).expect("valid dst id"); + let src = self.pool.get(src_id).expect("valid src id"); + let span_size = dst.span_size(); + let arena_base = self.arena.base_ptr() as usize; + let object_size = dst.object_size(); + let src_snapshot = src.bitmap().snapshot(); + let keep = dst.ptr_from_offset(arena_base, 0); + let remove = src.ptr_from_offset(arena_base, 0); + let barrier = ActiveMeshGuard::begin(remove, span_size)?; + let scratch = match unsafe { self.arena.begin_mesh(remove, span_size) } { + Ok(scratch) => scratch, + Err(error) => { + barrier.finish(); + return Err(error); + } + }; + + for slot in src_snapshot.iter_set_bits() { + let src_ptr = unsafe { scratch.add(slot * object_size) }; + let dst_ptr = dst.ptr_from_offset(arena_base, slot); + unsafe { + copy_nonoverlapping(src_ptr, dst_ptr, object_size); + } + let _ = dst.bitmap().try_set(slot); + let _ = src.free_offset(slot); + } + + let previous_family_head = dst.next_meshed(); + src.track_meshed_span(previous_family_head); + dst.track_meshed_span(src_id); + src.set_meshed(); + + if let Err(error) = unsafe { self.arena.finalize_mesh(keep, remove, scratch, span_size) } { + let _ = unsafe { self.arena.abort_mesh(remove, scratch, span_size) }; + barrier.finish(); + return Err(error); + } + barrier.finish(); + self.arena.free_phys(remove, span_size)?; + self.stats.record_mesh(dst.span().length, span_size); + Ok(()) + } + + fn mesh_class_candidates(&mut self, class: u8) -> usize { + let mut candidate_len = self.collect_compaction_candidates(class); + if candidate_len < 2 { + return 0; + } + self.shuffle_compaction_candidates(candidate_len); + + let mut meshes = 0usize; + while candidate_len > 1 { + let left_index = candidate_len - 1; + let left_id = unsafe { *self.compaction_candidates.add(left_index) }; + candidate_len -= 1; + + let mut match_index = 0usize; + while match_index < candidate_len { + let right_id = unsafe { *self.compaction_candidates.add(match_index) }; + let mesh_result = if let (Some(left), Some(right)) = + (self.pool.get(left_id), self.pool.get(right_id)) + { + if self.heap_is_compaction_candidate(class, left) + && self.heap_is_compaction_candidate(class, right) + && bitmaps_meshable(left.bitmap(), right.bitmap()) + { + if left.has_meshed_partner() && !right.has_meshed_partner() { + self.mesh_pair(left_id, right_id) + } else if right.has_meshed_partner() && 
!left.has_meshed_partner() { + self.mesh_pair(right_id, left_id) + } else if !right.has_meshed_partner() { + self.mesh_pair(left_id, right_id) + } else if !left.has_meshed_partner() { + self.mesh_pair(right_id, left_id) + } else { + Err(raw_sys::Error(raw_sys::EAGAIN)) + } + } else { + Err(raw_sys::Error(raw_sys::EAGAIN)) + } + } else { + Err(raw_sys::Error(raw_sys::EAGAIN)) + }; + + if mesh_result.is_ok() { + meshes += 1; + if match_index != candidate_len - 1 { + unsafe { + let replacement = *self.compaction_candidates.add(candidate_len - 1); + self.compaction_candidates + .add(match_index) + .write(replacement); + } + } + candidate_len -= 1; + break; + } + match_index += 1; + } + } + + meshes + } + + fn collect_compaction_candidates(&mut self, class: u8) -> usize { + let len = self.pool.len(); + let mut candidate_len = 0usize; + let mut id_val = 1u32; + while id_val <= len { + let id = MiniHeapId::new(id_val); + if let Some(heap) = self.pool.get(id) + && self.heap_is_compaction_candidate(class, heap) + { + unsafe { + self.compaction_candidates.add(candidate_len).write(id); + } + candidate_len += 1; + } + id_val += 1; + } + candidate_len + } + + fn shuffle_compaction_candidates(&mut self, len: usize) { + if len <= 1 { + return; + } + + let mut index = len - 1; + while index > 0 { + let swap_index = self.meshing_rng.in_range(0, index); + unsafe { + let left = *self.compaction_candidates.add(index); + let right = *self.compaction_candidates.add(swap_index); + self.compaction_candidates.add(index).write(right); + self.compaction_candidates.add(swap_index).write(left); + } + index -= 1; + } + } + + fn resolve_pointer(&self, ptr: *mut u8) -> Option { + loop { + let start_epoch = self.mesh_epoch.load(Ordering::Acquire); + if start_epoch & 1 != 0 { + futex_wait_for_value(&self.mesh_epoch, start_epoch); + continue; + } + + let owner_id = self.arena.miniheap_id_for_ptr(ptr)?; + let slot = self.resolve_family_slot(owner_id, ptr)?; + let end_epoch = self.mesh_epoch.load(Ordering::Acquire); + if start_epoch == end_epoch { + return Some(ResolvedPtr { owner_id, slot }); + } + } + } + + fn resolve_family_slot(&self, owner_id: MiniHeapId, ptr: *mut u8) -> Option { + let arena_base = self.arena.base_ptr() as usize; + let owner = self.pool.get(owner_id)?; + if owner.contains_ptr(arena_base, ptr) { + return Some(owner.slot_for_ptr(arena_base, ptr)); + } + + let mut current = owner.next_meshed(); + while current.has_value() { + let heap = self.pool.get(current)?; + if heap.contains_ptr(arena_base, ptr) { + return Some(heap.slot_for_ptr(arena_base, ptr)); + } + current = heap.next_meshed(); + } + + None + } +} + +fn miniheap_object_count(object_size: usize) -> usize { + let bitmap_limit = runtime_slots_per_span(); + (page_size() / object_size) + .max(MIN_SHUFFLE_VECTOR_LENGTH) + .min(bitmap_limit) +} + +#[inline(always)] +fn round_up_to_alignment(size: usize, align: usize) -> Option { + debug_assert!(align.is_power_of_two()); + let mask = align - 1; + size.checked_add(mask).map(|value| value & !mask) +} + +#[inline(always)] +fn page_alignment_for(align: usize) -> Option { + let page = page_size(); + if align <= page { + return Some(1); + } + + let pages = align / page; + if pages * page != align { + return None; + } + + u32::try_from(pages).ok() +} + +struct MeshingEpochGuard { + epoch: *const AtomicU32, +} + +impl MeshingEpochGuard { + fn new(epoch: *const AtomicU32) -> MeshingEpochGuard { + let epoch_ref = unsafe { &*epoch }; + let previous = epoch_ref.fetch_add(1, Ordering::AcqRel); + 
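+        // The epoch is odd while meshing is in progress: resolve_pointer waits on the
+        // futex until it turns even again, and the thread-local fast paths bail out to
+        // the slow path whenever they observe an odd value.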
debug_assert_eq!(previous & 1, 0); + MeshingEpochGuard { epoch } + } +} + +impl Drop for MeshingEpochGuard { + fn drop(&mut self) { + let epoch = unsafe { &*self.epoch }; + let previous = epoch.fetch_add(1, Ordering::AcqRel); + debug_assert_eq!(previous & 1, 1); + futex_wake_all(epoch); + } +} + +impl Drop for MeshAllocator { + fn drop(&mut self) { + unsafe { + core::ptr::drop_in_place(self.bootstrap_thread); + let _ = platform::munmap( + self.bootstrap_thread.cast::(), + size_of::(), + ); + let _ = platform::munmap( + self.compaction_candidates.cast::(), + self.pool.capacity() as usize * size_of::(), + ); + } + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/arena.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/arena.rs new file mode 100644 index 0000000..33e855b --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/arena.rs @@ -0,0 +1,428 @@ +use core::cell::UnsafeCell; +use core::mem::size_of; +use core::ptr::null_mut; +use core::sync::atomic::{AtomicU32, Ordering}; + +use super::constants::DEFAULT_ARENA_SIZE; +use super::miniheap::MiniHeapId; +use super::page::{PageConfig, page_count, round_up_to_page}; +use super::platform; +use super::raw_sys; +use super::span::Span; +use super::sync::FutexMutex; + +const MAX_FREE_SPANS: usize = 4096; + +#[derive(Debug)] +pub struct Arena { + config: PageConfig, + arena_size: usize, + page_count: u32, + fd: i32, + base: *mut u8, + owners: *mut AtomicU32, + next_page: AtomicU32, + free_spans: *mut Span, + free_span_count: UnsafeCell, + free_span_lock: FutexMutex, +} + +impl Arena { + #[inline] + pub fn new() -> raw_sys::Result { + Self::with_size(DEFAULT_ARENA_SIZE) + } + + #[inline] + pub fn with_size(arena_size: usize) -> raw_sys::Result { + let config = PageConfig::get(); + assert!(arena_size > 0); + assert_eq!(arena_size % config.size(), 0); + + let page_count = page_count(arena_size) as u32; + let fd = platform::memfd_create(c"rust-mesh-alloc".as_ptr().cast(), raw_sys::MFD_CLOEXEC)?; + platform::ftruncate(fd, arena_size as u64)?; + + let base = unsafe { + platform::mmap( + null_mut(), + arena_size, + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + raw_sys::MAP_SHARED, + fd, + 0, + )? + }; + + let owner_bytes = page_count as usize * size_of::(); + let owners = unsafe { + platform::map_anonymous(owner_bytes, raw_sys::PROT_READ | raw_sys::PROT_WRITE)? + as *mut AtomicU32 + }; + let free_span_bytes = MAX_FREE_SPANS * size_of::(); + let free_spans = unsafe { + platform::map_anonymous(free_span_bytes, raw_sys::PROT_READ | raw_sys::PROT_WRITE)? 
+ as *mut Span + }; + + Ok(Self { + config, + arena_size, + page_count, + fd, + base, + owners, + next_page: AtomicU32::new(0), + free_spans, + free_span_count: UnsafeCell::new(0), + free_span_lock: FutexMutex::new(), + }) + } + + #[inline(always)] + pub const fn config(&self) -> PageConfig { + self.config + } + + #[inline(always)] + pub const fn arena_size(&self) -> usize { + self.arena_size + } + + #[inline(always)] + pub const fn base_ptr(&self) -> *mut u8 { + self.base + } + + #[inline(always)] + pub fn contains(&self, ptr: *const u8) -> bool { + let start = self.base as usize; + let end = start + self.arena_size; + let value = ptr as usize; + start <= value && value < end + } + + #[inline] + pub fn reserve_pages(&self, page_count: u32, page_alignment: u32) -> Option { + assert!(page_count > 0); + assert!(page_alignment > 0); + let alignment = page_alignment.next_power_of_two(); + + if let Some(span) = self.take_free_span(page_count, alignment) { + return Some(span); + } + + loop { + let current = self.next_page.load(Ordering::Acquire); + let aligned = align_up_u32(current, alignment); + let end = aligned.checked_add(page_count)?; + if end > self.page_count { + return None; + } + + match self + .next_page + .compare_exchange(current, end, Ordering::AcqRel, Ordering::Acquire) + { + Ok(_) => return Some(Span::new(aligned, page_count)), + Err(_) => continue, + } + } + } + + #[inline] + pub fn page_alloc(&self, page_count: u32, page_alignment: u32) -> Option<(*mut u8, Span)> { + let span = self.reserve_pages(page_count, page_alignment)?; + Some((self.ptr_from_offset(span.offset as usize), span)) + } + + #[inline] + pub fn allocate_bytes(&self, size: usize, page_alignment: u32) -> Option<(*mut u8, Span)> { + let pages = page_count(size) as u32; + self.page_alloc(pages, page_alignment) + } + + #[inline(always)] + pub fn reserved_pages(&self) -> u32 { + self.next_page.load(Ordering::Acquire) + } + + pub fn reusable_span_stats(&self) -> (u32, u32) { + let _guard = self.free_span_lock.lock(); + let count = unsafe { *self.free_span_count.get() }; + let mut pages = 0u32; + let mut index = 0usize; + while index < count as usize { + let span = unsafe { *self.free_spans.add(index) }; + pages += span.length; + index += 1; + } + (count, pages) + } + + #[inline] + pub fn track_miniheap(&self, span: Span, id: MiniHeapId) { + for page in 0..span.length { + self.owner_at_offset(span.offset + page) + .store(id.value(), Ordering::Release); + } + } + + #[inline] + pub fn clear_miniheap(&self, span: Span) { + for page in 0..span.length { + self.owner_at_offset(span.offset + page) + .store(0, Ordering::Release); + } + } + + #[inline] + pub fn release_span(&self, span: Span) { + if span.empty() { + return; + } + + let _guard = self.free_span_lock.lock(); + let count = unsafe { &mut *self.free_span_count.get() }; + let mut merged = span; + let mut index = 0usize; + while index < *count as usize { + let other = unsafe { *self.free_spans.add(index) }; + if other.offset + other.length == merged.offset { + merged = Span::new(other.offset, other.length + merged.length); + self.remove_free_span_at(index, count); + continue; + } + if merged.offset + merged.length == other.offset { + merged = Span::new(merged.offset, merged.length + other.length); + self.remove_free_span_at(index, count); + continue; + } + index += 1; + } + + self.push_free_span(merged, count); + } + + #[inline] + pub fn miniheap_id_for_ptr(&self, ptr: *const u8) -> Option { + if !self.contains(ptr) { + return None; + } + let off = self.offset_for(ptr); 
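+        // One AtomicU32 owner slot per arena page; a stored id of zero means the
+        // page is not currently tracked by any MiniHeap.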
+ let value = self.owner_at_offset(off).load(Ordering::Acquire); + if value == 0 { + None + } else { + Some(MiniHeapId::new(value)) + } + } + + /// # Safety + /// + /// `remove..remove+size` must describe a valid, page-aligned mapping within this arena. + /// The returned alias is a private scratch mapping of the old source backing. The original + /// `remove` range is protected with `PROT_NONE` and must be restored or remapped by the + /// caller before any blocked mutators are allowed to resume. + #[inline] + pub unsafe fn begin_mesh(&self, remove: *mut u8, size: usize) -> raw_sys::Result<*mut u8> { + let rounded = round_up_to_page(size); + let remove_off = self.offset_for(remove); + unsafe { + platform::mprotect(remove, rounded, raw_sys::PROT_NONE)?; + platform::mmap( + core::ptr::null_mut(), + rounded, + raw_sys::PROT_READ, + raw_sys::MAP_SHARED, + self.fd, + (remove_off as usize * self.config.size()) as u64, + ) + } + } + + /// # Safety + /// + /// Restores the source mapping to its original backing after a failed mesh attempt. + #[inline] + pub unsafe fn abort_mesh( + &self, + remove: *mut u8, + scratch: *mut u8, + size: usize, + ) -> raw_sys::Result<()> { + let rounded = round_up_to_page(size); + let remove_off = self.offset_for(remove); + unsafe { + if !scratch.is_null() { + platform::munmap(scratch, rounded)?; + } + platform::mmap( + remove, + rounded, + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + raw_sys::MAP_SHARED | raw_sys::MAP_FIXED, + self.fd, + (remove_off as usize * self.config.size()) as u64, + )?; + } + Ok(()) + } + + /// # Safety + /// + /// `keep` and `remove` must each point to valid page-aligned ranges of at least `size` + /// bytes within this arena. The caller must ensure that aliasing these ranges is valid for + /// the current allocator state and that any required object copying has already completed. 
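+    ///
+    /// A minimal sketch of the intended call sequence (illustrative only; it mirrors
+    /// `MeshAllocator::mesh_pair` and assumes the live-object copy has succeeded):
+    ///
+    /// ```ignore
+    /// let scratch = unsafe { arena.begin_mesh(remove, span_size)? };
+    /// // ... copy live objects out of `scratch` into the `keep` span ...
+    /// if let Err(error) = unsafe { arena.finalize_mesh(keep, remove, scratch, span_size) } {
+    ///     let _ = unsafe { arena.abort_mesh(remove, scratch, span_size) };
+    ///     return Err(error);
+    /// }
+    /// arena.free_phys(remove, span_size)?;
+    /// ```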
+ #[inline] + pub unsafe fn finalize_mesh( + &self, + keep: *mut u8, + remove: *mut u8, + scratch: *mut u8, + size: usize, + ) -> raw_sys::Result<()> { + let rounded = round_up_to_page(size); + let keep_off = self.offset_for(keep); + let remove_off = self.offset_for(remove); + let pages = page_count(rounded); + + unsafe { + platform::mmap( + remove, + rounded, + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + raw_sys::MAP_SHARED | raw_sys::MAP_FIXED, + self.fd, + (keep_off as usize * self.config.size()) as u64, + )?; + if !scratch.is_null() { + platform::munmap(scratch, rounded)?; + } + } + + let keep_id = self.owner_at_offset(keep_off).load(Ordering::Acquire); + for page in 0..pages { + self.owner_at_offset(remove_off + page as u32) + .store(keep_id, Ordering::Release); + } + + Ok(()) + } + + #[inline] + pub fn free_phys(&self, ptr: *mut u8, size: usize) -> raw_sys::Result<()> { + let rounded = round_up_to_page(size); + let offset = (ptr as usize).wrapping_sub(self.base as usize); + platform::fallocate( + self.fd, + raw_sys::FALLOC_FL_PUNCH_HOLE | raw_sys::FALLOC_FL_KEEP_SIZE, + offset as u64, + rounded as u64, + ) + } + + #[inline] + pub fn reset_identity_mapping(&self, span: Span) -> raw_sys::Result<()> { + let ptr = self.ptr_from_offset(span.offset as usize); + unsafe { + platform::mmap( + ptr, + span.byte_length_for_page_size(self.config.size()), + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + raw_sys::MAP_SHARED | raw_sys::MAP_FIXED, + self.fd, + (span.offset as usize * self.config.size()) as u64, + )?; + } + Ok(()) + } + + #[inline(always)] + pub fn offset_for(&self, ptr: *const u8) -> u32 { + let delta = (ptr as usize).wrapping_sub(self.base as usize); + (delta >> self.config.shift()) as u32 + } + + #[inline(always)] + pub fn ptr_from_offset(&self, offset: usize) -> *mut u8 { + unsafe { self.base.add(offset << self.config.shift()) } + } + + #[inline(always)] + fn owner_at_offset(&self, offset: u32) -> &AtomicU32 { + assert!(offset < self.page_count); + unsafe { &*self.owners.add(offset as usize) } + } +} + +impl Drop for Arena { + fn drop(&mut self) { + let owner_bytes = self.page_count as usize * size_of::(); + let free_span_bytes = MAX_FREE_SPANS * size_of::(); + unsafe { + let _ = platform::munmap(self.free_spans as *mut u8, free_span_bytes); + let _ = platform::munmap(self.owners as *mut u8, owner_bytes); + let _ = platform::munmap(self.base, self.arena_size); + } + let _ = platform::close(self.fd); + } +} + +#[inline(always)] +fn align_up_u32(value: u32, alignment: u32) -> u32 { + debug_assert!(alignment.is_power_of_two()); + (value + alignment - 1) & !(alignment - 1) +} + +impl Arena { + fn take_free_span(&self, page_count: u32, alignment: u32) -> Option { + let _guard = self.free_span_lock.lock(); + let count = unsafe { &mut *self.free_span_count.get() }; + let mut index = 0usize; + while index < *count as usize { + let span = unsafe { *self.free_spans.add(index) }; + let aligned = align_up_u32(span.offset, alignment); + let prefix = aligned.checked_sub(span.offset)?; + let total = prefix.checked_add(page_count)?; + if total <= span.length { + self.remove_free_span_at(index, count); + + if prefix > 0 { + self.push_free_span(Span::new(span.offset, prefix), count); + } + + let suffix_offset = aligned + page_count; + let suffix_length = span.length - total; + if suffix_length > 0 { + self.push_free_span(Span::new(suffix_offset, suffix_length), count); + } + + return Some(Span::new(aligned, page_count)); + } + index += 1; + } + + None + } + + fn push_free_span(&self, span: Span, 
count: &mut u32) { + assert!((*count as usize) < MAX_FREE_SPANS); + unsafe { + self.free_spans.add(*count as usize).write(span); + } + *count += 1; + } + + fn remove_free_span_at(&self, index: usize, count: &mut u32) { + debug_assert!(index < *count as usize); + let last = *count as usize - 1; + if index != last { + let replacement = unsafe { *self.free_spans.add(last) }; + unsafe { + self.free_spans.add(index).write(replacement); + } + } + *count -= 1; + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/bitmap.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/bitmap.rs new file mode 100644 index 0000000..2ebb34e --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/bitmap.rs @@ -0,0 +1,236 @@ +use core::sync::atomic::{AtomicUsize, Ordering}; + +use super::constants::MAX_OBJECT_SLOTS_PER_SPAN; + +const USIZE_BITS: usize = usize::BITS as usize; +const BITMAP_WORDS: usize = MAX_OBJECT_SLOTS_PER_SPAN / USIZE_BITS; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct RelaxedBitmap { + bit_count: u16, + words: [usize; BITMAP_WORDS], +} + +impl RelaxedBitmap { + #[inline] + pub fn new(bit_count: usize) -> Self { + assert!(bit_count <= MAX_OBJECT_SLOTS_PER_SPAN); + Self { + bit_count: bit_count as u16, + words: [0; BITMAP_WORDS], + } + } + + #[inline(always)] + pub const fn bit_count(&self) -> usize { + self.bit_count as usize + } + + #[inline(always)] + pub fn words(&self) -> &[usize; BITMAP_WORDS] { + &self.words + } + + #[inline(always)] + pub fn words_mut(&mut self) -> &mut [usize; BITMAP_WORDS] { + &mut self.words + } + + #[inline] + pub fn clear(&mut self) { + self.words = [0; BITMAP_WORDS]; + } + + #[inline] + pub fn set_all(&mut self) { + self.words = [usize::MAX; BITMAP_WORDS]; + self.mask_unused_bits(); + } + + #[inline] + pub fn invert_masked(&mut self) { + for word in &mut self.words { + *word = !*word; + } + self.mask_unused_bits(); + } + + #[inline(always)] + pub fn try_set(&mut self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + let old = self.words[word]; + self.words[word] = old | mask; + old & mask == 0 + } + + #[inline(always)] + pub fn unset(&mut self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + let old = self.words[word]; + self.words[word] = old & !mask; + old & mask != 0 + } + + #[inline(always)] + pub fn is_set(&self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + self.words[word] & mask != 0 + } + + #[inline] + pub fn in_use_count(&self) -> u32 { + self.words.iter().map(|word| word.count_ones()).sum() + } + + #[inline] + pub fn iter_set_bits(&self) -> BitIter { + BitIter::new(self.words, self.bit_count()) + } + + #[inline] + fn mask_unused_bits(&mut self) { + let valid_bits = self.bit_count(); + if valid_bits == MAX_OBJECT_SLOTS_PER_SPAN { + return; + } + + let used_words = valid_bits / USIZE_BITS; + let remainder = valid_bits % USIZE_BITS; + for word in self + .words + .iter_mut() + .skip(used_words + usize::from(remainder != 0)) + { + *word = 0; + } + if remainder != 0 { + self.words[used_words] &= (1usize << remainder) - 1; + } + } +} + +#[derive(Debug)] +pub struct AtomicBitmap { + bit_count: u16, + words: [AtomicUsize; BITMAP_WORDS], +} + +impl AtomicBitmap { + #[inline] + pub fn new(bit_count: usize) -> Self { + assert!(bit_count <= MAX_OBJECT_SLOTS_PER_SPAN); + Self { + bit_count: bit_count as u16, + words: [const { AtomicUsize::new(0) }; BITMAP_WORDS], + } + } + + #[inline(always)] + pub const fn bit_count(&self) -> usize { + self.bit_count as usize + } 
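+    /// Atomically sets bit `index`, returning `true` only if this call performed
+    /// the clear-to-set transition (i.e. the slot was successfully claimed).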
+ + #[inline(always)] + pub fn try_set(&self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + let old = self.words[word].fetch_or(mask, Ordering::AcqRel); + old & mask == 0 + } + + #[inline(always)] + pub fn unset(&self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + let old = self.words[word].fetch_and(!mask, Ordering::AcqRel); + old & mask != 0 + } + + #[inline(always)] + pub fn is_set(&self, index: usize) -> bool { + let (word, mask) = word_and_mask(index); + self.words[word].load(Ordering::Acquire) & mask != 0 + } + + #[inline] + pub fn in_use_count(&self) -> u32 { + self.words + .iter() + .map(|word| word.load(Ordering::Acquire).count_ones()) + .sum() + } + + #[inline] + pub fn swap_words(&self, new_words: &[usize; BITMAP_WORDS]) -> [usize; BITMAP_WORDS] { + let mut old_words = [0usize; BITMAP_WORDS]; + let mut i = 0; + while i < BITMAP_WORDS { + old_words[i] = self.words[i].swap(new_words[i], Ordering::AcqRel); + i += 1; + } + old_words + } + + #[inline] + pub fn snapshot(&self) -> RelaxedBitmap { + let mut bitmap = RelaxedBitmap::new(self.bit_count()); + let mut i = 0; + while i < BITMAP_WORDS { + bitmap.words_mut()[i] = self.words[i].load(Ordering::Acquire); + i += 1; + } + bitmap + } + + #[inline] + pub fn take_free_bits(&self) -> RelaxedBitmap { + let mut all_used = RelaxedBitmap::new(self.bit_count()); + all_used.set_all(); + let previous = self.swap_words(all_used.words()); + + let mut free_bits = RelaxedBitmap::new(self.bit_count()); + *free_bits.words_mut() = previous; + free_bits.invert_masked(); + free_bits + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct BitIter { + words: [usize; BITMAP_WORDS], + bit_count: usize, + next_index: usize, +} + +impl BitIter { + #[inline(always)] + pub fn new(words: [usize; BITMAP_WORDS], bit_count: usize) -> Self { + Self { + words, + bit_count, + next_index: 0, + } + } +} + +impl Iterator for BitIter { + type Item = usize; + + fn next(&mut self) -> Option { + while self.next_index < self.bit_count { + let current = self.next_index; + self.next_index += 1; + let (word, mask) = word_and_mask(current); + if self.words[word] & mask != 0 { + return Some(current); + } + } + None + } +} + +#[inline(always)] +fn word_and_mask(index: usize) -> (usize, usize) { + debug_assert!(index < MAX_OBJECT_SLOTS_PER_SPAN); + let word = index / USIZE_BITS; + let bit = index % USIZE_BITS; + (word, 1usize << bit) +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/constants.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/constants.rs new file mode 100644 index 0000000..9df6038 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/constants.rs @@ -0,0 +1,19 @@ +pub const MIN_SUPPORTED_PAGE_SIZE: usize = 4096; +pub const MAX_SUPPORTED_PAGE_SIZE: usize = 16384; +pub const MIN_OBJECT_SIZE: usize = 16; +pub const MAX_SMALL_ALLOCATION: usize = 16_384; +pub const NUM_SIZE_CLASSES: usize = 25; +pub const OCCUPANCY_CUTOFF_NUMERATOR: u32 = 4; +pub const OCCUPANCY_CUTOFF_DENOMINATOR: u32 = 5; +pub const MIN_SHUFFLE_VECTOR_LENGTH: usize = 8; +pub const MAX_ATTACHED_MINIHEAPS_PER_CLASS: usize = 48; +pub const MAX_OBJECT_SLOTS_PER_SPAN: usize = MAX_SUPPORTED_PAGE_SIZE / MIN_OBJECT_SIZE; +pub const MAX_SHUFFLE_VECTOR_LENGTH: usize = MAX_OBJECT_SLOTS_PER_SPAN; +pub const DEFAULT_ARENA_SIZE: usize = 64 * 1024 * 1024 * 1024; +pub const MINIHEAP_REFILL_GOAL_SIZE: usize = 16 * 1024; + +#[inline(always)] +pub const fn is_below_partial_threshold(in_use: u32, max_count: u32) -> bool { + 
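+    // Cross-multiplied form of `in_use / max_count < 4/5` (the occupancy cutoff);
+    // `saturating_mul` keeps the comparison division-free without overflow panics.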
in_use.saturating_mul(OCCUPANCY_CUTOFF_DENOMINATOR) + < max_count.saturating_mul(OCCUPANCY_CUTOFF_NUMERATOR) +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/fault.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/fault.rs new file mode 100644 index 0000000..ec5290e --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/fault.rs @@ -0,0 +1,132 @@ +use core::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; + +use super::raw_sys; +use super::sync::{futex_wait_for_value, futex_wake_all}; + +const INSTALL_UNINITIALIZED: u32 = 0; +const INSTALL_READY: u32 = 1; +const INSTALL_FAILED: u32 = 2; + +static INSTALL_STATE: AtomicU32 = AtomicU32::new(INSTALL_UNINITIALIZED); +static ACTIVE_MESH_SEQ: AtomicU32 = AtomicU32::new(0); +static ACTIVE_MESH_START: AtomicUsize = AtomicUsize::new(0); +static ACTIVE_MESH_LEN: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub struct ActiveMeshGuard { + active: bool, +} + +impl ActiveMeshGuard { + pub fn begin(start: *mut u8, len: usize) -> raw_sys::Result { + ensure_fault_mediation_installed()?; + loop { + let seq = ACTIVE_MESH_SEQ.load(Ordering::Acquire); + if seq & 1 == 0 { + break; + } + futex_wait_for_value(&ACTIVE_MESH_SEQ, seq); + } + ACTIVE_MESH_START.store(start as usize, Ordering::Release); + ACTIVE_MESH_LEN.store(len, Ordering::Release); + let previous = ACTIVE_MESH_SEQ.fetch_add(1, Ordering::AcqRel); + debug_assert_eq!(previous & 1, 0); + Ok(Self { active: true }) + } + + pub fn finish(mut self) { + self.release(); + } + + fn release(&mut self) { + if !self.active { + return; + } + + ACTIVE_MESH_START.store(0, Ordering::Release); + ACTIVE_MESH_LEN.store(0, Ordering::Release); + let previous = ACTIVE_MESH_SEQ.fetch_add(1, Ordering::AcqRel); + debug_assert_eq!(previous & 1, 1); + futex_wake_all(&ACTIVE_MESH_SEQ); + self.active = false; + } +} + +impl Drop for ActiveMeshGuard { + fn drop(&mut self) { + self.release(); + } +} + +pub fn ensure_fault_mediation_installed() -> raw_sys::Result<()> { + match INSTALL_STATE.compare_exchange( + INSTALL_UNINITIALIZED, + INSTALL_READY, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) | Err(INSTALL_READY) => Ok(()), + Err(_) => { + INSTALL_STATE.store(INSTALL_FAILED, Ordering::Release); + Err(raw_sys::Error(raw_sys::EAGAIN)) + } + } +} + +pub fn ok_to_proceed(ptr: *const u8) -> bool { + let address = ptr as usize; + let mut waited = false; + loop { + let seq = ACTIVE_MESH_SEQ.load(Ordering::Acquire); + if seq & 1 == 0 { + return waited; + } + + let start = ACTIVE_MESH_START.load(Ordering::Acquire); + let len = ACTIVE_MESH_LEN.load(Ordering::Acquire); + let end = start.saturating_add(len); + if address < start || address >= end { + return waited; + } + + waited = true; + futex_wait_for_value(&ACTIVE_MESH_SEQ, seq); + } +} + +pub fn retry_on_efault(ptr: *const u8, mut op: F) -> raw_sys::Result +where + F: FnMut() -> raw_sys::Result, +{ + loop { + match op() { + Ok(value) => return Ok(value), + Err(error) if error.errno() == raw_sys::EFAULT && ok_to_proceed(ptr) => continue, + Err(error) => return Err(error), + } + } +} + +pub fn retry_on_efault_ptrs(ptrs: &[*const u8], mut op: F) -> raw_sys::Result +where + F: FnMut() -> raw_sys::Result, +{ + loop { + match op() { + Ok(value) => return Ok(value), + Err(error) if error.errno() == raw_sys::EFAULT => { + let mut waited = false; + for &ptr in ptrs { + if !ptr.is_null() { + waited |= ok_to_proceed(ptr); + } + } + if waited { + continue; + } + return Err(error); + } + Err(error) => return Err(error), + } + } +} diff --git 
a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/global.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/global.rs new file mode 100644 index 0000000..19bffc6 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/global.rs @@ -0,0 +1,453 @@ +use core::alloc::{GlobalAlloc, Layout}; +use core::cell::UnsafeCell; +use core::mem::MaybeUninit; +use core::ptr::{addr_of_mut, drop_in_place, null, null_mut}; +use core::sync::atomic::{AtomicU32, Ordering}; + +use super::allocator::MeshAllocator; +use super::constants::DEFAULT_ARENA_SIZE; +use super::stats::{ + CompactionAdvice, CompactionSkipReason, MeshStats, RuntimeCompactionPolicy, + RuntimeCompactionResult, +}; +use super::sync::{FutexMutex, futex_wait_for_value, futex_wake_all}; +use super::thread_local_heap::ThreadLocalHeap; + +const INIT_UNINITIALIZED: u32 = 0; +const INIT_IN_PROGRESS: u32 = 1; +const INIT_READY: u32 = 2; +const INIT_FAILED: u32 = 3; + +pub const DEFAULT_GLOBAL_MINIHEAP_CAPACITY: u32 = 4096; + +const TLS_UNINITIALIZED: u32 = 0; +const TLS_READY: u32 = 1; +const TLS_FAILED: u32 = 2; +const SAFEPOINT_INACTIVE: u32 = 0; +const SAFEPOINT_ACTIVE: u32 = 1; + +#[thread_local] +static mut THREAD_HEAP_STATE: u32 = TLS_UNINITIALIZED; +#[thread_local] +static mut THREAD_HEAP: MaybeUninit = MaybeUninit::uninit(); +#[thread_local] +static mut THREAD_SAFEPOINT_STATE: u32 = SAFEPOINT_INACTIVE; +#[thread_local] +static mut THREAD_HEAP_OWNER: *const GlobalMeshAllocator = null(); + +#[derive(Debug)] +pub struct GlobalMeshAllocator { + arena_size: usize, + miniheap_capacity: u32, + init_state: AtomicU32, + registered_threads: AtomicU32, + quiescent_threads: AtomicU32, + lock: FutexMutex, + allocator: UnsafeCell>, +} + +impl GlobalMeshAllocator { + pub const fn new(arena_size: usize, miniheap_capacity: u32) -> Self { + Self { + arena_size, + miniheap_capacity, + init_state: AtomicU32::new(INIT_UNINITIALIZED), + registered_threads: AtomicU32::new(0), + quiescent_threads: AtomicU32::new(0), + lock: FutexMutex::new(), + allocator: UnsafeCell::new(MaybeUninit::uninit()), + } + } + + pub const fn with_default_config() -> Self { + Self::new(DEFAULT_ARENA_SIZE, DEFAULT_GLOBAL_MINIHEAP_CAPACITY) + } + + pub fn init_thread(&self) -> bool { + self.thread_heap().is_some() + } + + pub fn shutdown_thread(&self) { + unsafe { + let state_ptr = addr_of_mut!(THREAD_HEAP_STATE); + let heap_ptr = addr_of_mut!(THREAD_HEAP); + let safepoint_ptr = addr_of_mut!(THREAD_SAFEPOINT_STATE); + let owner_ptr = addr_of_mut!(THREAD_HEAP_OWNER); + if *safepoint_ptr == SAFEPOINT_ACTIVE { + self.quiescent_threads.fetch_sub(1, Ordering::AcqRel); + *safepoint_ptr = SAFEPOINT_INACTIVE; + } + if *state_ptr == TLS_READY { + let _ = self.with_existing_allocator_mut(|allocator| { + allocator.shutdown_thread((*heap_ptr).assume_init_mut()); + }); + drop_in_place((*heap_ptr).as_mut_ptr()); + self.registered_threads.fetch_sub(1, Ordering::AcqRel); + } + *state_ptr = TLS_UNINITIALIZED; + *owner_ptr = null(); + } + } + + /// Marks the current thread quiescent for cooperative compaction. + /// + /// This is **not** true concurrent compaction: arbitrary loads/stores through existing raw + /// pointers remain outside allocator control, so compaction is only safe once all registered + /// allocator threads have voluntarily entered this state. 
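+    ///
+    /// A hypothetical use from a quiescent point (sketch only; `ALLOCATOR` and
+    /// `policy` stand in for a `GlobalMeshAllocator` instance and a
+    /// `RuntimeCompactionPolicy` value obtained elsewhere):
+    ///
+    /// ```ignore
+    /// if ALLOCATOR.enter_quiescent_compaction_state() {
+    ///     if ALLOCATOR.quiescent_compaction_ready() {
+    ///         let result = ALLOCATOR.compact_when_quiescent(policy);
+    ///         // inspect `result` for `Compacted { meshes, .. }` or a skip reason
+    ///     }
+    ///     ALLOCATOR.leave_quiescent_compaction_state();
+    /// }
+    /// ```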
+ pub fn enter_quiescent_compaction_state(&self) -> bool { + if self.thread_heap().is_none() { + return false; + } + + unsafe { + let safepoint_ptr = addr_of_mut!(THREAD_SAFEPOINT_STATE); + if *safepoint_ptr == SAFEPOINT_ACTIVE { + return true; + } + *safepoint_ptr = SAFEPOINT_ACTIVE; + } + self.quiescent_threads.fetch_add(1, Ordering::AcqRel); + true + } + + /// Leaves the cooperative quiescent compaction state for the current thread. + pub fn leave_quiescent_compaction_state(&self) { + unsafe { + let safepoint_ptr = addr_of_mut!(THREAD_SAFEPOINT_STATE); + if *safepoint_ptr == SAFEPOINT_ACTIVE { + *safepoint_ptr = SAFEPOINT_INACTIVE; + self.quiescent_threads.fetch_sub(1, Ordering::AcqRel); + } + } + } + + /// Returns true only when every registered allocator thread is quiescent. + /// + /// This is a cooperative global-quiescence check, not a proof that active mutators can safely + /// race with remap/migration. + pub fn quiescent_compaction_ready(&self) -> bool { + let registered = self.registered_threads.load(Ordering::Acquire); + registered != 0 && registered == self.quiescent_threads.load(Ordering::Acquire) + } + + /// Runs cooperative quiescent compaction when the current thread and every other registered + /// allocator thread have voluntarily stopped allocator-visible activity. + /// + /// This API intentionally does **not** claim to provide concurrent compaction with active + /// mutators. Achieving that would require heavier machinery such as page-fault mediation, + /// signal handling, syscall retry/interposition, or equivalent runtime coordination for raw + /// pointer accesses and kernel I/O into moving pages. + pub fn compact_when_quiescent( + &self, + policy: RuntimeCompactionPolicy, + ) -> RuntimeCompactionResult { + if !self.current_thread_in_safepoint() { + return RuntimeCompactionResult::Skipped { + reason: CompactionSkipReason::NotAtSafepoint, + advice: self.compaction_advice(), + }; + } + if !self.quiescent_compaction_ready() { + return RuntimeCompactionResult::Skipped { + reason: CompactionSkipReason::ThreadsActive, + advice: self.compaction_advice(), + }; + } + + let Some(thread_heap) = self.thread_heap() else { + return RuntimeCompactionResult::Skipped { + reason: CompactionSkipReason::ThreadUnavailable, + advice: self.compaction_advice(), + }; + }; + let Some(result) = self.with_existing_allocator_mut(|allocator| { + allocator.shutdown_thread(thread_heap); + let advice = allocator.stats().compaction_advice(); + if !policy.should_compact(&advice) { + return RuntimeCompactionResult::Skipped { + reason: CompactionSkipReason::Policy, + advice: Some(advice), + }; + } + + let meshes = allocator.compact_with_thread(thread_heap); + RuntimeCompactionResult::Compacted { meshes, advice } + }) else { + return RuntimeCompactionResult::Skipped { + reason: CompactionSkipReason::AllocatorUnavailable, + advice: None, + }; + }; + + result + } + + /// Compatibility alias for `enter_quiescent_compaction_state`. + pub fn enter_compaction_safepoint(&self) -> bool { + self.enter_quiescent_compaction_state() + } + + /// Compatibility alias for `leave_quiescent_compaction_state`. + pub fn leave_compaction_safepoint(&self) { + self.leave_quiescent_compaction_state(); + } + + /// Compatibility alias for `quiescent_compaction_ready`. + pub fn compaction_safepoint_ready(&self) -> bool { + self.quiescent_compaction_ready() + } + + /// Compatibility alias for `compact_when_quiescent`. 
+ pub fn compact_at_safepoint(&self, policy: RuntimeCompactionPolicy) -> RuntimeCompactionResult { + self.compact_when_quiescent(policy) + } + + pub fn compact(&self) -> usize { + let Some(thread_heap) = self.thread_heap() else { + return 0; + }; + self.with_allocator(|allocator| allocator.compact_with_thread(thread_heap)) + .unwrap_or_default() + } + + pub fn stats(&self) -> Option { + self.with_existing_allocator(|allocator| allocator.stats()) + } + + pub fn compaction_advice(&self) -> Option { + self.stats().map(|stats| stats.compaction_advice()) + } + + fn with_allocator(&self, f: impl FnOnce(&mut MeshAllocator) -> R) -> Option { + if !self.ensure_initialized() { + return None; + } + + let _guard = self.lock.lock(); + let allocator = unsafe { (&mut *self.allocator.get()).assume_init_mut() }; + Some(f(allocator)) + } + + fn with_existing_allocator(&self, f: impl FnOnce(&MeshAllocator) -> R) -> Option { + if self.init_state.load(Ordering::Acquire) != INIT_READY { + return None; + } + + let _guard = self.lock.lock(); + if self.init_state.load(Ordering::Acquire) != INIT_READY { + return None; + } + let allocator = unsafe { (&*self.allocator.get()).assume_init_ref() }; + Some(f(allocator)) + } + + fn with_existing_allocator_mut(&self, f: impl FnOnce(&mut MeshAllocator) -> R) -> Option { + if self.init_state.load(Ordering::Acquire) != INIT_READY { + return None; + } + + let _guard = self.lock.lock(); + if self.init_state.load(Ordering::Acquire) != INIT_READY { + return None; + } + let allocator = unsafe { (&mut *self.allocator.get()).assume_init_mut() }; + Some(f(allocator)) + } + + fn ensure_initialized(&self) -> bool { + loop { + match self.init_state.load(Ordering::Acquire) { + INIT_READY => return true, + INIT_FAILED => return false, + INIT_UNINITIALIZED => { + if self + .init_state + .compare_exchange( + INIT_UNINITIALIZED, + INIT_IN_PROGRESS, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_ok() + { + let result = MeshAllocator::new(self.arena_size, self.miniheap_capacity); + match result { + Ok(allocator) => unsafe { + (*self.allocator.get()).write(allocator); + self.init_state.store(INIT_READY, Ordering::Release); + futex_wake_all(&self.init_state); + return true; + }, + Err(_) => { + self.init_state.store(INIT_FAILED, Ordering::Release); + futex_wake_all(&self.init_state); + return false; + } + } + } + } + INIT_IN_PROGRESS => futex_wait_for_value(&self.init_state, INIT_IN_PROGRESS), + _ => return false, + } + } + } + + fn thread_heap(&self) -> Option<&'static mut ThreadLocalHeap> { + unsafe { + let owner_ptr = addr_of_mut!(THREAD_HEAP_OWNER); + let heap_ptr = addr_of_mut!(THREAD_HEAP); + match THREAD_HEAP_STATE { + TLS_READY if core::ptr::eq(*owner_ptr, self) => Some((*heap_ptr).assume_init_mut()), + TLS_READY => { + self.reset_foreign_thread_heap(); + self.thread_heap() + } + TLS_FAILED => None, + TLS_UNINITIALIZED => match ThreadLocalHeap::new() { + Ok(heap) => { + (*heap_ptr).write(heap); + *addr_of_mut!(THREAD_SAFEPOINT_STATE) = SAFEPOINT_INACTIVE; + *owner_ptr = self as *const _; + self.registered_threads.fetch_add(1, Ordering::AcqRel); + THREAD_HEAP_STATE = TLS_READY; + Some((*heap_ptr).assume_init_mut()) + } + Err(_) => { + THREAD_HEAP_STATE = TLS_FAILED; + None + } + }, + _ => None, + } + } + } + + fn current_thread_in_safepoint(&self) -> bool { + let _ = self; + unsafe { + core::ptr::eq(THREAD_HEAP_OWNER, self) && THREAD_SAFEPOINT_STATE == SAFEPOINT_ACTIVE + } + } + + fn reset_foreign_thread_heap(&self) { + let _ = self; + unsafe { + if THREAD_HEAP_STATE == TLS_READY { + 
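+                // The thread-local heap was registered with a different allocator
+                // instance; tear it down so `thread_heap` can rebuild it under `self`.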
drop_in_place(addr_of_mut!(THREAD_HEAP).cast::()); + } + THREAD_HEAP_STATE = TLS_UNINITIALIZED; + THREAD_SAFEPOINT_STATE = SAFEPOINT_INACTIVE; + THREAD_HEAP_OWNER = null(); + } + } +} + +impl Drop for GlobalMeshAllocator { + fn drop(&mut self) { + unsafe { + if core::ptr::eq(THREAD_HEAP_OWNER, self) { + self.shutdown_thread(); + } + } + + if self.init_state.load(Ordering::Acquire) == INIT_READY { + let _guard = self.lock.lock(); + if self.init_state.load(Ordering::Acquire) == INIT_READY { + unsafe { + drop_in_place((&mut *self.allocator.get()).as_mut_ptr()); + } + self.init_state.store(INIT_FAILED, Ordering::Release); + } + } + } +} + +unsafe impl Sync for GlobalMeshAllocator {} +unsafe impl Send for GlobalMeshAllocator {} + +unsafe impl GlobalAlloc for GlobalMeshAllocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + if !self.ensure_initialized() { + return null_mut(); + } + let Some(thread_heap) = self.thread_heap() else { + return null_mut(); + }; + + let allocator_ref = unsafe { (&*self.allocator.get()).assume_init_ref() }; + if let Some(class) = size_class_for_layout(layout) + && let Some(ptr) = allocator_ref.try_allocate_small_local(thread_heap, class) + { + return ptr; + } + + self.with_allocator(|allocator| { + allocator + .allocate_layout_with_thread(thread_heap, layout) + .unwrap_or(null_mut()) + }) + .unwrap_or(null_mut()) + } + + unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { + if !self.ensure_initialized() { + return; + } + let Some(thread_heap) = self.thread_heap() else { + return; + }; + + let allocator_ref = unsafe { (&*self.allocator.get()).assume_init_ref() }; + if allocator_ref.try_deallocate_local(ptr, thread_heap) { + return; + } + + let _ = self.with_allocator(|allocator| allocator.deallocate_with_thread(ptr, thread_heap)); + } + + unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { + let ptr = unsafe { self.alloc(layout) }; + if !ptr.is_null() { + unsafe { + ptr.write_bytes(0, layout.size()); + } + } + ptr + } + + unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + if !self.ensure_initialized() { + return null_mut(); + } + let Some(thread_heap) = self.thread_heap() else { + return null_mut(); + }; + + self.with_allocator(|allocator| unsafe { + allocator + .reallocate_with_thread(ptr, layout, new_size, thread_heap) + .unwrap_or(null_mut()) + }) + .unwrap_or(null_mut()) + } +} + +fn size_class_for_layout(layout: Layout) -> Option { + if layout.align() > super::page::page_size() { + return None; + } + let aligned_size = layout + .size() + .max(1) + .checked_add(layout.align() - 1) + .map(|value| value & !(layout.align() - 1))?; + if aligned_size > super::constants::MAX_SMALL_ALLOCATION { + return None; + } + let class = super::size_map::size_class_for(aligned_size)?; + if super::size_map::byte_size_for_class(class).is_multiple_of(layout.align()) { + Some(class) + } else { + None + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/meshing.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/meshing.rs new file mode 100644 index 0000000..fcf083a --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/meshing.rs @@ -0,0 +1,15 @@ +use super::bitmap::AtomicBitmap; + +#[inline] +pub fn bitmaps_meshable(left: &AtomicBitmap, right: &AtomicBitmap) -> bool { + let left_words = left.snapshot(); + let right_words = right.snapshot(); + + for (lhs, rhs) in left_words.words().iter().zip(right_words.words().iter()) { + if lhs & rhs != 0 { + return false; + } + } + + true +} diff --git 
a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/miniheap.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/miniheap.rs new file mode 100644 index 0000000..9b90dd4 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/miniheap.rs @@ -0,0 +1,374 @@ +use core::sync::atomic::{AtomicU32, Ordering}; + +use super::bitmap::AtomicBitmap; +use super::page::page_size; +use super::size_map::{byte_size_for_class, size_class_for}; +use super::span::Span; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[repr(u8)] +pub enum FreelistId { + Full = 0, + Partial = 1, + Empty = 2, + Attached = 3, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct MiniHeapId(u32); + +impl MiniHeapId { + #[inline(always)] + pub const fn new(id: u32) -> Self { + Self(id) + } + + #[inline(always)] + pub const fn value(self) -> u32 { + self.0 + } + + #[inline(always)] + pub const fn has_value(self) -> bool { + self.0 != 0 + } +} + +#[derive(Debug)] +pub struct MiniHeapFlags { + bits: AtomicU32, +} + +impl MiniHeapFlags { + const SIZE_CLASS_SHIFT: u32 = 0; + const FREELIST_ID_SHIFT: u32 = 6; + const SHUFFLE_OFFSET_SHIFT: u32 = 8; + const MAX_COUNT_SHIFT: u32 = 16; + const PENDING_OFFSET: u32 = 27; + const MESHED_OFFSET: u32 = 30; + + #[inline] + pub fn new( + max_count: u16, + size_class: u8, + shuffle_offset: u8, + freelist_id: FreelistId, + ) -> Self { + let bits = ((max_count as u32) << Self::MAX_COUNT_SHIFT) + | ((shuffle_offset as u32) << Self::SHUFFLE_OFFSET_SHIFT) + | ((freelist_id as u32) << Self::FREELIST_ID_SHIFT) + | ((size_class as u32) << Self::SIZE_CLASS_SHIFT); + Self { + bits: AtomicU32::new(bits), + } + } + + #[inline(always)] + fn load(&self) -> u32 { + self.bits.load(Ordering::Acquire) + } + + #[inline(always)] + fn update_masked(&self, mask: u32, value: u32) { + let mut old = self.bits.load(Ordering::Relaxed); + loop { + let new = (old & mask) | value; + match self + .bits + .compare_exchange_weak(old, new, Ordering::AcqRel, Ordering::Relaxed) + { + Ok(_) => return, + Err(next) => old = next, + } + } + } + + #[inline(always)] + pub fn max_count(&self) -> u16 { + ((self.load() >> Self::MAX_COUNT_SHIFT) & 0x7ff) as u16 + } + + #[inline(always)] + pub fn size_class(&self) -> u8 { + ((self.load() >> Self::SIZE_CLASS_SHIFT) & 0x3f) as u8 + } + + #[inline(always)] + pub fn freelist_id(&self) -> FreelistId { + match (self.load() >> Self::FREELIST_ID_SHIFT) & 0x3 { + 0 => FreelistId::Full, + 1 => FreelistId::Partial, + 2 => FreelistId::Empty, + _ => FreelistId::Attached, + } + } + + #[inline(always)] + pub fn set_freelist_id(&self, id: FreelistId) { + let mask = !(0x3 << Self::FREELIST_ID_SHIFT); + self.update_masked(mask, (id as u32) << Self::FREELIST_ID_SHIFT); + } + + #[inline(always)] + pub fn shuffle_vector_offset(&self) -> u8 { + ((self.load() >> Self::SHUFFLE_OFFSET_SHIFT) & 0xff) as u8 + } + + #[inline(always)] + pub fn set_shuffle_vector_offset(&self, offset: u8) { + let mask = !(0xff << Self::SHUFFLE_OFFSET_SHIFT); + self.update_masked(mask, (offset as u32) << Self::SHUFFLE_OFFSET_SHIFT); + } + + #[inline(always)] + pub fn is_pending(&self) -> bool { + self.load() & (1 << Self::PENDING_OFFSET) != 0 + } + + #[inline(always)] + pub fn clear_pending(&self) { + self.bits + .fetch_and(!(1 << Self::PENDING_OFFSET), Ordering::AcqRel); + } + + #[inline(always)] + pub fn try_set_pending_from_full(&self) -> bool { + let full = (FreelistId::Full as u32) << Self::FREELIST_ID_SHIFT; + let pending = 1 << Self::PENDING_OFFSET; + let freelist_mask = 0x3 << Self::FREELIST_ID_SHIFT; + 
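+        // CAS loop: the Full -> Full|Pending transition must win exactly once, and
+        // it bails out if the heap has left the Full list or is already pending.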
+ let mut old = self.bits.load(Ordering::Relaxed); + loop { + if (old & freelist_mask) != full || (old & pending) != 0 { + return false; + } + let new = old | pending; + match self + .bits + .compare_exchange_weak(old, new, Ordering::AcqRel, Ordering::Relaxed) + { + Ok(_) => return true, + Err(next) => old = next, + } + } + } + + #[inline(always)] + pub fn is_meshed(&self) -> bool { + self.load() & (1 << Self::MESHED_OFFSET) != 0 + } + + #[inline(always)] + pub fn set_meshed(&self) { + self.bits + .fetch_or(1 << Self::MESHED_OFFSET, Ordering::AcqRel); + } +} + +#[derive(Debug)] +pub struct MiniHeap { + span: Span, + current_thread: AtomicU32, + flags: MiniHeapFlags, + next_meshed: AtomicU32, + pending_next: AtomicU32, + bitmap: AtomicBitmap, +} + +impl MiniHeap { + #[inline] + pub fn new(span: Span, object_count: u16, object_size: usize) -> Self { + let size_class = if object_count > 1 { + size_class_for(object_size).unwrap_or(1) + } else { + 1 + }; + + Self { + span, + current_thread: AtomicU32::new(0), + flags: MiniHeapFlags::new(object_count, size_class, 0, FreelistId::Attached), + next_meshed: AtomicU32::new(0), + pending_next: AtomicU32::new(0), + bitmap: AtomicBitmap::new(object_count as usize), + } + } + + #[inline(always)] + pub const fn span(&self) -> Span { + self.span + } + + #[inline(always)] + pub fn flags(&self) -> &MiniHeapFlags { + &self.flags + } + + #[inline(always)] + pub fn bitmap(&self) -> &AtomicBitmap { + &self.bitmap + } + + #[inline(always)] + pub fn max_count(&self) -> u16 { + self.flags.max_count() + } + + #[inline(always)] + pub fn size_class(&self) -> u8 { + self.flags.size_class() + } + + #[inline(always)] + pub fn is_large_alloc(&self) -> bool { + self.max_count() == 1 + } + + #[inline(always)] + pub fn object_size(&self) -> usize { + if self.is_large_alloc() { + self.span.byte_length_for_page_size(page_size()) + } else { + byte_size_for_class(self.size_class()) + } + } + + #[inline(always)] + pub fn span_size(&self) -> usize { + self.span.byte_length_for_page_size(page_size()) + } + + #[inline(always)] + pub fn in_use_count(&self) -> u32 { + self.bitmap.in_use_count() + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.in_use_count() == 0 + } + + #[inline(always)] + pub fn is_full(&self) -> bool { + self.in_use_count() == self.max_count() as u32 + } + + #[inline(always)] + pub fn bytes_free(&self) -> usize { + (self.max_count() as usize - self.in_use_count() as usize) * self.object_size() + } + + #[inline(always)] + pub fn current_thread(&self) -> u32 { + self.current_thread.load(Ordering::Acquire) + } + + #[inline(always)] + pub fn set_attached(&self, thread_id: u32) { + self.current_thread.store(thread_id, Ordering::Release); + self.flags.set_freelist_id(FreelistId::Attached); + } + + #[inline(always)] + pub fn unset_attached(&self) { + self.current_thread.store(0, Ordering::Release); + } + + #[inline(always)] + pub fn is_attached(&self) -> bool { + self.current_thread() != 0 + } + + #[inline(always)] + pub fn set_shuffle_vector_offset(&self, offset: u8) { + self.flags.set_shuffle_vector_offset(offset); + } + + #[inline(always)] + pub fn shuffle_vector_offset(&self) -> u8 { + self.flags.shuffle_vector_offset() + } + + #[inline(always)] + pub fn set_pending_next(&self, next: MiniHeapId) { + self.pending_next.store(next.value(), Ordering::Release); + } + + #[inline(always)] + pub fn pending_next(&self) -> MiniHeapId { + MiniHeapId::new(self.pending_next.load(Ordering::Acquire)) + } + + #[inline(always)] + pub fn track_meshed_span(&self, next: 
MiniHeapId) { + self.next_meshed.store(next.value(), Ordering::Release); + } + + #[inline(always)] + pub fn next_meshed(&self) -> MiniHeapId { + MiniHeapId::new(self.next_meshed.load(Ordering::Acquire)) + } + + #[inline(always)] + pub fn has_meshed_partner(&self) -> bool { + self.next_meshed().has_value() + } + + #[inline(always)] + pub fn set_meshed(&self) { + self.flags.set_meshed(); + } + + #[inline(always)] + pub fn is_meshed(&self) -> bool { + self.flags.is_meshed() + } + + #[inline(always)] + pub fn is_meshing_candidate(&self) -> bool { + !self.is_attached() && self.object_size() < page_size() + } + + #[inline(always)] + pub fn fullness(&self) -> f32 { + self.in_use_count() as f32 / self.max_count() as f32 + } + + #[inline(always)] + pub fn malloc_at(&self, arena_begin: usize, slot: usize) -> Option<*mut u8> { + if !self.bitmap.try_set(slot) { + return None; + } + Some(self.ptr_from_offset(arena_begin, slot)) + } + + #[inline(always)] + pub fn ptr_from_offset(&self, arena_begin: usize, slot: usize) -> *mut u8 { + let span_start = arena_begin + (self.span.offset as usize * page_size()); + (span_start + slot * self.object_size()) as *mut u8 + } + + #[inline(always)] + pub fn contains_ptr(&self, arena_begin: usize, ptr: *const u8) -> bool { + let span_start = arena_begin + (self.span.offset as usize * page_size()); + let span_end = span_start + self.span_size(); + let ptr = ptr as usize; + span_start <= ptr && ptr < span_end + } + + #[inline(always)] + pub fn free_offset(&self, slot: usize) -> bool { + self.bitmap.unset(slot) + } + + #[inline(always)] + pub fn clear_if_not_free(&self, slot: usize) -> bool { + self.bitmap.unset(slot) + } + + #[inline(always)] + pub fn slot_for_ptr(&self, arena_begin: usize, ptr: *const u8) -> usize { + let span_start = arena_begin + (self.span.offset as usize * page_size()); + ((ptr as usize) - span_start) / self.object_size() + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/mod.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/mod.rs new file mode 100644 index 0000000..10f8614 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/mod.rs @@ -0,0 +1,45 @@ +pub mod allocator; +pub mod arena; +pub mod bitmap; +pub mod constants; +pub mod fault; +pub mod global; +pub mod meshing; +pub mod miniheap; +pub mod page; +pub mod platform; +pub mod pool; +pub mod raw_sys; +pub mod rng; +pub mod shuffle; +pub mod size_map; +pub mod span; +pub mod stats; +pub mod sync; +pub mod thread_local_heap; + +pub use allocator::MeshAllocator; +pub use arena::Arena; +pub use bitmap::{AtomicBitmap, BitIter, RelaxedBitmap}; +pub use constants::*; +pub use fault::{ + ActiveMeshGuard, ensure_fault_mediation_installed, ok_to_proceed, retry_on_efault, + retry_on_efault_ptrs, +}; +pub use global::{DEFAULT_GLOBAL_MINIHEAP_CAPACITY, GlobalMeshAllocator}; +pub use meshing::bitmaps_meshable; +pub use miniheap::{FreelistId, MiniHeap, MiniHeapFlags, MiniHeapId}; +pub use page::{ + PageConfig, page_count, page_shift, page_size, round_up_to_page, runtime_slots_per_span, +}; +pub use platform::{PlatformHooks, PlatformInstallError, install_platform_hooks}; +pub use rng::{Mwc, Mwc64}; +pub use shuffle::{ShuffleEntry, ShuffleVector}; +pub use size_map::{CLASS_TO_SIZE, NUM_SIZE_CLASSES, byte_size_for_class, size_class_for}; +pub use span::Span; +pub use stats::{ + CompactionAdvice, CompactionEstimate, CompactionRecommendation, CompactionSkipReason, + MeshStats, RuntimeCompactionPolicy, RuntimeCompactionResult, +}; +pub use sync::FutexMutex; +pub use 
thread_local_heap::ThreadLocalHeap; diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/page.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/page.rs new file mode 100644 index 0000000..ff74ff5 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/page.rs @@ -0,0 +1,94 @@ +use core::sync::atomic::{AtomicUsize, Ordering}; + +use super::constants::{MAX_SUPPORTED_PAGE_SIZE, MIN_OBJECT_SIZE, MIN_SUPPORTED_PAGE_SIZE}; +use super::platform; +static PAGE_SIZE_CACHE: AtomicUsize = AtomicUsize::new(0); + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct PageConfig { + size: usize, + shift: u32, + slots_per_span: usize, +} + +impl PageConfig { + #[inline(always)] + pub fn get() -> Self { + let size = page_size(); + Self { + size, + shift: size.trailing_zeros(), + slots_per_span: size / MIN_OBJECT_SIZE, + } + } + + #[inline(always)] + pub const fn size(self) -> usize { + self.size + } + + #[inline(always)] + pub const fn shift(self) -> u32 { + self.shift + } + + #[inline(always)] + pub const fn slots_per_span(self) -> usize { + self.slots_per_span + } +} + +#[inline] +pub fn page_size() -> usize { + let cached = PAGE_SIZE_CACHE.load(Ordering::Acquire); + if cached != 0 { + return cached; + } + + let size = query_page_size(); + match PAGE_SIZE_CACHE.compare_exchange(0, size, Ordering::AcqRel, Ordering::Acquire) { + Ok(_) => size, + Err(existing) => existing, + } +} + +#[inline(always)] +pub fn page_shift() -> u32 { + page_size().trailing_zeros() +} + +#[inline(always)] +pub fn page_count(size: usize) -> usize { + let page = page_size(); + size.div_ceil(page) +} + +#[inline(always)] +pub fn round_up_to_page(size: usize) -> usize { + page_count(size) * page_size() +} + +#[inline(always)] +pub fn runtime_slots_per_span() -> usize { + page_size() / MIN_OBJECT_SIZE +} + +fn query_page_size() -> usize { + let size = platform::page_size(); + assert!( + size.is_power_of_two(), + "page size is not a power of two: {size}" + ); + assert!( + (MIN_SUPPORTED_PAGE_SIZE..=MAX_SUPPORTED_PAGE_SIZE).contains(&size), + "unsupported page size {size}; supported range is {}..={}", + MIN_SUPPORTED_PAGE_SIZE, + MAX_SUPPORTED_PAGE_SIZE + ); + assert_eq!( + size % MIN_OBJECT_SIZE, + 0, + "page size {size} is not MIN_OBJECT_SIZE-aligned" + ); + size +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/platform.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/platform.rs new file mode 100644 index 0000000..a0f02df --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/platform.rs @@ -0,0 +1,168 @@ +use core::sync::atomic::{AtomicBool, AtomicPtr, Ordering}; + +use super::raw_sys; + +#[derive(Clone, Copy, Debug)] +pub struct PlatformHooks { + pub page_size: fn() -> usize, + pub gettid: fn() -> raw_sys::Result, + pub getrandom: fn(&mut [u8], u32) -> raw_sys::Result, + pub memfd_create: fn(*const u8, u32) -> raw_sys::Result, + pub ftruncate: fn(i32, u64) -> raw_sys::Result<()>, + pub fallocate: fn(i32, u32, u64, u64) -> raw_sys::Result<()>, + pub close: fn(i32) -> raw_sys::Result<()>, + pub futex_wait: unsafe fn(*const u32, u32, u32) -> raw_sys::Result<()>, + pub futex_wake: unsafe fn(*const u32, u32, u32) -> raw_sys::Result, + pub mmap: unsafe fn(*mut u8, usize, u32, u32, i32, u64) -> raw_sys::Result<*mut u8>, + pub map_anonymous: unsafe fn(usize, u32) -> raw_sys::Result<*mut u8>, + pub mprotect: unsafe fn(*mut u8, usize, u32) -> raw_sys::Result<()>, + pub munmap: unsafe fn(*mut u8, usize) -> raw_sys::Result<()>, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] 
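+/// Why [`install_platform_hooks`] refused to install a replacement hook table:
+/// `AlreadyConfigured` means a different table was installed first, while
+/// `AlreadyInUse` means the hooks were already frozen by first use.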
+pub enum PlatformInstallError { + AlreadyConfigured, + AlreadyInUse, +} + +static DEFAULT_PLATFORM_HOOKS: PlatformHooks = PlatformHooks { + page_size: default_page_size, + gettid: raw_sys::gettid, + getrandom: raw_sys::getrandom, + memfd_create: raw_sys::memfd_create, + ftruncate: raw_sys::ftruncate, + fallocate: raw_sys::fallocate, + close: raw_sys::close, + futex_wait: raw_sys::futex_wait, + futex_wake: raw_sys::futex_wake, + mmap: raw_sys::mmap, + map_anonymous: raw_sys::map_anonymous, + mprotect: raw_sys::mprotect, + munmap: raw_sys::munmap, +}; + +static PLATFORM_HOOKS: AtomicPtr = + AtomicPtr::new((&DEFAULT_PLATFORM_HOOKS as *const PlatformHooks).cast_mut()); +static PLATFORM_FROZEN: AtomicBool = AtomicBool::new(false); + +pub fn install_platform_hooks(hooks: &'static PlatformHooks) -> Result<(), PlatformInstallError> { + if PLATFORM_FROZEN.load(Ordering::Acquire) { + return Err(PlatformInstallError::AlreadyInUse); + } + + let default_ptr = (&DEFAULT_PLATFORM_HOOKS as *const PlatformHooks).cast_mut(); + let hooks_ptr = (hooks as *const PlatformHooks).cast_mut(); + match PLATFORM_HOOKS.compare_exchange( + default_ptr, + hooks_ptr, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => Ok(()), + Err(existing) if existing == hooks_ptr => Ok(()), + Err(_) => Err(PlatformInstallError::AlreadyConfigured), + } +} + +#[inline(always)] +pub fn page_size() -> usize { + (platform_hooks().page_size)() +} + +#[inline(always)] +pub fn gettid() -> raw_sys::Result { + (platform_hooks().gettid)() +} + +#[inline(always)] +pub fn getrandom(buf: &mut [u8], flags: u32) -> raw_sys::Result { + (platform_hooks().getrandom)(buf, flags) +} + +#[inline(always)] +pub fn memfd_create(name: *const u8, flags: u32) -> raw_sys::Result { + (platform_hooks().memfd_create)(name, flags) +} + +#[inline(always)] +pub fn ftruncate(fd: i32, len: u64) -> raw_sys::Result<()> { + (platform_hooks().ftruncate)(fd, len) +} + +#[inline(always)] +pub fn fallocate(fd: i32, mode: u32, offset: u64, len: u64) -> raw_sys::Result<()> { + (platform_hooks().fallocate)(fd, mode, offset, len) +} + +#[inline(always)] +pub fn close(fd: i32) -> raw_sys::Result<()> { + (platform_hooks().close)(fd) +} + +#[inline(always)] +/// # Safety +/// +/// `uaddr` must be valid for the kernel to access as a futex word for the duration of the call. +pub unsafe fn futex_wait(uaddr: *const u32, op: u32, expected: u32) -> raw_sys::Result<()> { + unsafe { (platform_hooks().futex_wait)(uaddr, op, expected) } +} + +#[inline(always)] +/// # Safety +/// +/// `uaddr` must be valid for the kernel to access as a futex word for the duration of the call. +pub unsafe fn futex_wake(uaddr: *const u32, op: u32, count: u32) -> raw_sys::Result { + unsafe { (platform_hooks().futex_wake)(uaddr, op, count) } +} + +#[inline(always)] +/// # Safety +/// +/// The caller must ensure the mapping arguments satisfy the platform `mmap(2)` contract. +pub unsafe fn mmap( + addr: *mut u8, + len: usize, + prot: u32, + flags: u32, + fd: i32, + offset: u64, +) -> raw_sys::Result<*mut u8> { + unsafe { (platform_hooks().mmap)(addr, len, prot, flags, fd, offset) } +} + +#[inline(always)] +/// # Safety +/// +/// The caller must later unmap the returned memory exactly once. +pub unsafe fn map_anonymous(len: usize, prot: u32) -> raw_sys::Result<*mut u8> { + unsafe { (platform_hooks().map_anonymous)(len, prot) } +} + +#[inline(always)] +/// # Safety +/// +/// `addr..addr+len` must refer to a valid mapped region. 
+pub unsafe fn mprotect(addr: *mut u8, len: usize, prot: u32) -> raw_sys::Result<()> { + unsafe { (platform_hooks().mprotect)(addr, len, prot) } +} + +#[inline(always)] +/// # Safety +/// +/// `addr..addr+len` must refer to a valid mapping that may be unmapped exactly once. +pub unsafe fn munmap(addr: *mut u8, len: usize) -> raw_sys::Result<()> { + unsafe { (platform_hooks().munmap)(addr, len) } +} + +#[inline(always)] +fn platform_hooks() -> &'static PlatformHooks { + PLATFORM_FROZEN.store(true, Ordering::Release); + let hooks = PLATFORM_HOOKS.load(Ordering::Acquire); + unsafe { &*hooks } +} + +fn default_page_size() -> usize { + let size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + assert!(size > 0, "sysconf(_SC_PAGESIZE) failed"); + size as usize +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/pool.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/pool.rs new file mode 100644 index 0000000..39d95d7 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/pool.rs @@ -0,0 +1,152 @@ +use core::mem::size_of; +use core::sync::atomic::{AtomicU8, AtomicU32, Ordering}; + +use super::miniheap::{MiniHeap, MiniHeapId}; +use super::platform; +use super::raw_sys; +use super::span::Span; + +#[derive(Debug)] +pub struct MiniHeapPool { + base: *mut MiniHeap, + live: *mut AtomicU8, + free_ids: *mut u32, + capacity: u32, + len: AtomicU32, + live_len: AtomicU32, + free_len: u32, +} + +impl MiniHeapPool { + #[inline] + pub fn with_capacity(capacity: u32) -> raw_sys::Result { + assert!(capacity > 0); + let bytes = capacity as usize * size_of::(); + let live = unsafe { + platform::map_anonymous(capacity as usize, raw_sys::PROT_READ | raw_sys::PROT_WRITE)? + }; + let free_ids = unsafe { + platform::map_anonymous( + capacity as usize * size_of::(), + raw_sys::PROT_READ | raw_sys::PROT_WRITE, + )? as *mut u32 + }; + let base = unsafe { + platform::map_anonymous(bytes, raw_sys::PROT_READ | raw_sys::PROT_WRITE)? 
+ as *mut MiniHeap + }; + + Ok(Self { + base, + live: live as *mut AtomicU8, + free_ids, + capacity, + len: AtomicU32::new(0), + live_len: AtomicU32::new(0), + free_len: 0, + }) + } + + #[inline(always)] + pub fn len(&self) -> u32 { + self.len.load(Ordering::Acquire) + } + + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline(always)] + pub const fn capacity(&self) -> u32 { + self.capacity + } + + #[inline(always)] + pub fn live_len(&self) -> u32 { + self.live_len.load(Ordering::Acquire) + } + + #[inline] + pub fn allocate( + &mut self, + span: Span, + object_count: u16, + object_size: usize, + ) -> Option<(MiniHeapId, &MiniHeap)> { + let (id, index) = if self.free_len > 0 { + self.free_len -= 1; + let id = unsafe { *self.free_ids.add(self.free_len as usize) }; + (MiniHeapId::new(id), (id - 1) as usize) + } else { + let len = self.len.load(Ordering::Relaxed); + if len >= self.capacity { + return None; + } + let id = len + 1; + self.len.store(id, Ordering::Release); + (MiniHeapId::new(id), (id - 1) as usize) + }; + self.live_len.fetch_add(1, Ordering::AcqRel); + + unsafe { + let ptr = self.base.add(index); + ptr.write(MiniHeap::new(span, object_count, object_size)); + (&*self.live.add(index)).store(1, Ordering::Release); + Some((id, &*ptr)) + } + } + + #[inline] + pub fn get(&self, id: MiniHeapId) -> Option<&MiniHeap> { + if !id.has_value() { + return None; + } + + let index = id.value() - 1; + if index >= self.len.load(Ordering::Acquire) { + return None; + } + if unsafe { (&*self.live.add(index as usize)).load(Ordering::Acquire) } == 0 { + return None; + } + + unsafe { Some(&*self.base.add(index as usize)) } + } + + #[inline] + pub fn release(&mut self, id: MiniHeapId) -> bool { + if !id.has_value() { + return false; + } + + let index = id.value() - 1; + if index >= self.len.load(Ordering::Acquire) { + return false; + } + if unsafe { (&*self.live.add(index as usize)).load(Ordering::Acquire) } == 0 { + return false; + } + + unsafe { + (&*self.live.add(index as usize)).store(0, Ordering::Release); + self.free_ids.add(self.free_len as usize).write(id.value()); + } + self.free_len += 1; + self.live_len.fetch_sub(1, Ordering::AcqRel); + true + } +} + +impl Drop for MiniHeapPool { + fn drop(&mut self) { + let bytes = self.capacity as usize * size_of::(); + let live_bytes = self.capacity as usize; + let free_bytes = self.capacity as usize * size_of::(); + unsafe { + let _ = platform::munmap(self.free_ids as *mut u8, free_bytes); + let _ = platform::munmap(self.live as *mut u8, live_bytes); + let _ = platform::munmap(self.base as *mut u8, bytes); + } + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/raw_sys.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/raw_sys.rs new file mode 100644 index 0000000..5756853 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/raw_sys.rs @@ -0,0 +1,218 @@ +use core::ptr::null_mut; + +pub type Result = core::result::Result; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Error(pub i32); + +impl Error { + #[inline(always)] + pub const fn errno(self) -> i32 { + self.0 + } +} + +pub const PROT_NONE: u32 = libc::PROT_NONE as u32; +pub const PROT_READ: u32 = libc::PROT_READ as u32; +pub const PROT_WRITE: u32 = libc::PROT_WRITE as u32; + +pub const MAP_SHARED: u32 = libc::MAP_SHARED as u32; +pub const MAP_PRIVATE: u32 = libc::MAP_PRIVATE as u32; +pub const MAP_FIXED: u32 = libc::MAP_FIXED as u32; +pub const MAP_ANONYMOUS: u32 = libc::MAP_ANONYMOUS as u32; + +pub const MFD_CLOEXEC: 
u32 = libc::MFD_CLOEXEC; + +pub const FALLOC_FL_KEEP_SIZE: u32 = libc::FALLOC_FL_KEEP_SIZE as u32; +pub const FALLOC_FL_PUNCH_HOLE: u32 = libc::FALLOC_FL_PUNCH_HOLE as u32; + +pub const FUTEX_WAIT: u32 = libc::FUTEX_WAIT as u32; +pub const FUTEX_WAKE: u32 = libc::FUTEX_WAKE as u32; +pub const FUTEX_PRIVATE_FLAG: u32 = libc::FUTEX_PRIVATE_FLAG as u32; +pub const FUTEX_WAIT_PRIVATE: u32 = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; +pub const FUTEX_WAKE_PRIVATE: u32 = FUTEX_WAKE | FUTEX_PRIVATE_FLAG; + +pub const EINTR: i32 = libc::EINTR; +pub const EAGAIN: i32 = libc::EAGAIN; +pub const EFAULT: i32 = libc::EFAULT; + +#[inline(always)] +fn last_error() -> Error { + Error( + std::io::Error::last_os_error() + .raw_os_error() + .unwrap_or(libc::EINVAL), + ) +} + +#[inline(always)] +fn map_c_int(result: libc::c_int) -> Result { + if result == -1 { + Err(last_error()) + } else { + Ok(result) + } +} + +#[inline(always)] +fn map_c_long(result: libc::c_long) -> Result { + if result == -1 { + Err(last_error()) + } else { + Ok(result) + } +} + +#[inline(always)] +fn to_off_t(value: u64) -> Result { + value.try_into().map_err(|_| Error(libc::EINVAL)) +} + +#[inline(always)] +pub fn getpid() -> Result { + let pid = unsafe { libc::getpid() }; + if pid == -1 { + Err(last_error()) + } else { + Ok(pid as u32) + } +} + +#[inline(always)] +pub fn gettid() -> Result { + unsafe { map_c_long(libc::syscall(libc::SYS_gettid)).map(|value| value as u32) } +} + +#[inline(always)] +pub fn close(fd: i32) -> Result<()> { + unsafe { map_c_int(libc::close(fd)).map(|_| ()) } +} + +#[inline(always)] +pub fn memfd_create(name: *const u8, flags: u32) -> Result { + unsafe { + map_c_long(libc::syscall( + libc::SYS_memfd_create, + name.cast::(), + flags as libc::c_uint, + )) + .map(|fd| fd as i32) + } +} + +#[inline(always)] +pub fn ftruncate(fd: i32, len: u64) -> Result<()> { + let len = to_off_t(len)?; + unsafe { map_c_int(libc::ftruncate(fd, len)).map(|_| ()) } +} + +#[inline(always)] +pub fn fallocate(fd: i32, mode: u32, offset: u64, len: u64) -> Result<()> { + let offset = to_off_t(offset)?; + let len = to_off_t(len)?; + unsafe { map_c_int(libc::fallocate(fd, mode as libc::c_int, offset, len)).map(|_| ()) } +} + +#[inline(always)] +pub fn getrandom(buf: &mut [u8], flags: u32) -> Result { + let result = + unsafe { libc::getrandom(buf.as_mut_ptr().cast(), buf.len(), flags as libc::c_uint) }; + if result == -1 { + Err(last_error()) + } else { + Ok(result as usize) + } +} + +#[inline(always)] +/// # Safety +/// +/// `uaddr` must be valid for the kernel to read as a futex word for the duration of the call. +pub unsafe fn futex_wait(uaddr: *const u32, op: u32, expected: u32) -> Result<()> { + unsafe { + map_c_long(libc::syscall( + libc::SYS_futex, + uaddr, + op as libc::c_int, + expected, + null_mut::(), + 0, + 0, + )) + .map(|_| ()) + } +} + +#[inline(always)] +/// # Safety +/// +/// `uaddr` must be valid for the kernel to access as a futex word for the duration of the call. +pub unsafe fn futex_wake(uaddr: *const u32, op: u32, count: u32) -> Result { + unsafe { + map_c_long(libc::syscall( + libc::SYS_futex, + uaddr, + op as libc::c_int, + count, + null_mut::(), + 0, + 0, + )) + .map(|woken| woken as u32) + } +} + +#[inline(always)] +/// # Safety +/// +/// The caller must uphold the platform `mmap(2)` contract for the provided arguments and manage +/// any returned mapping according to Rust aliasing and lifetime rules. 
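+///
+/// A minimal usage sketch (hypothetical sizes; the `map_anonymous` helper below is the usual
+/// entry point inside the allocator):
+///
+/// ```ignore
+/// let page = unsafe { mmap(null_mut(), 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)? };
+/// // ... use the 4 KiB mapping ...
+/// unsafe { munmap(page, 4096)? };
+/// ```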
+pub unsafe fn mmap( + addr: *mut u8, + len: usize, + prot: u32, + flags: u32, + fd: i32, + offset: u64, +) -> Result<*mut u8> { + let offset = to_off_t(offset)?; + let result = unsafe { + libc::mmap( + addr.cast(), + len, + prot as libc::c_int, + flags as libc::c_int, + fd, + offset, + ) + }; + if result == libc::MAP_FAILED { + Err(last_error()) + } else { + Ok(result.cast()) + } +} + +#[inline(always)] +/// # Safety +/// +/// The caller must later unmap the returned memory exactly once. +pub unsafe fn map_anonymous(len: usize, prot: u32) -> Result<*mut u8> { + unsafe { mmap(null_mut(), len, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) } +} + +#[inline(always)] +/// # Safety +/// +/// `addr..addr+len` must refer to a valid mapped region. +pub unsafe fn mprotect(addr: *mut u8, len: usize, prot: u32) -> Result<()> { + unsafe { map_c_int(libc::mprotect(addr.cast(), len, prot as libc::c_int)).map(|_| ()) } +} + +#[inline(always)] +/// # Safety +/// +/// `addr..addr+len` must refer to a valid mapping that can be unmapped exactly once. +pub unsafe fn munmap(addr: *mut u8, len: usize) -> Result<()> { + unsafe { map_c_int(libc::munmap(addr.cast(), len)).map(|_| ()) } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/rng.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/rng.rs new file mode 100644 index 0000000..a51ece0 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/rng.rs @@ -0,0 +1,96 @@ +use super::platform; +use super::raw_sys; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Mwc64 { + x: u64, + c: u64, + t: u64, + value: u64, + index: u8, +} + +impl Mwc64 { + #[inline(always)] + pub const fn new(seed1: u64, seed2: u64) -> Self { + Self { + x: (seed1 << 32).wrapping_add(seed2), + c: 123_456_123_456_123_456, + t: 0, + value: 0, + index: 2, + } + } + + #[inline(always)] + pub fn from_os_seed() -> raw_sys::Result { + let mut buf = [0u8; 16]; + let mut filled = 0usize; + while filled < buf.len() { + let read = platform::getrandom(&mut buf[filled..], 0)?; + if read == 0 { + return Err(raw_sys::Error(5)); + } + filled += read; + } + + let seed1 = u64::from_ne_bytes(buf[0..8].try_into().unwrap()); + let seed2 = u64::from_ne_bytes(buf[8..16].try_into().unwrap()); + Ok(Self::new(seed1.max(1), seed2.max(1))) + } + + #[inline(always)] + fn next_block(&mut self) -> u64 { + self.t = (self.x << 58).wrapping_add(self.c); + self.c = self.x >> 6; + self.x = self.x.wrapping_add(self.t); + self.c = self.c.wrapping_add((self.x < self.t) as u64); + self.x + } + + #[inline(always)] + pub fn next_u32(&mut self) -> u32 { + if self.index == 2 { + self.value = self.next_block(); + self.index = 0; + } + + let shift = (self.index as u32) * 32; + let value = (self.value >> shift) as u32; + self.index += 1; + value + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Mwc { + inner: Mwc64, +} + +impl Mwc { + #[inline(always)] + pub const fn new(seed1: u64, seed2: u64) -> Self { + Self { + inner: Mwc64::new(seed1, seed2), + } + } + + #[inline(always)] + pub fn from_os_seed() -> raw_sys::Result { + Ok(Self { + inner: Mwc64::from_os_seed()?, + }) + } + + #[inline(always)] + pub fn next_u32(&mut self) -> u32 { + self.inner.next_u32() + } + + #[inline(always)] + pub fn in_range(&mut self, min: usize, max: usize) -> usize { + debug_assert!(min <= max); + let range = 1 + max - min; + min + ((((self.next_u32() as u64) * (range as u64)) >> 32) as usize) + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/shuffle.rs 
b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/shuffle.rs new file mode 100644 index 0000000..1d298d4 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/shuffle.rs @@ -0,0 +1,204 @@ +use super::bitmap::RelaxedBitmap; +use super::constants::{ + MAX_OBJECT_SLOTS_PER_SPAN, MAX_SHUFFLE_VECTOR_LENGTH, MIN_OBJECT_SIZE, + MIN_SHUFFLE_VECTOR_LENGTH, +}; +use super::page::page_size; +use super::rng::Mwc; + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct ShuffleEntry { + pub miniheap_offset: u16, + pub slot_index: u16, +} + +impl ShuffleEntry { + pub const EMPTY: Self = Self { + miniheap_offset: u16::MAX, + slot_index: u16::MAX, + }; + + #[inline(always)] + pub const fn new(miniheap_offset: u16, slot_index: u16) -> Self { + Self { + miniheap_offset, + slot_index, + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct ShuffleVector { + entries: [ShuffleEntry; MAX_SHUFFLE_VECTOR_LENGTH], + max_count: u16, + off: u16, + prng: Mwc, +} + +impl ShuffleVector { + #[inline] + pub fn for_object_size(object_size: usize, seed1: u64, seed2: u64) -> Self { + Self::with_capacity(Self::capacity_for_object_size(object_size), seed1, seed2) + } + + #[inline] + pub fn with_capacity(max_count: usize, seed1: u64, seed2: u64) -> Self { + assert!(max_count <= MAX_SHUFFLE_VECTOR_LENGTH); + Self { + entries: [ShuffleEntry::EMPTY; MAX_SHUFFLE_VECTOR_LENGTH], + max_count: max_count as u16, + off: max_count as u16, + prng: Mwc::new(seed1.max(1), seed2.max(1)), + } + } + + #[inline(always)] + pub fn capacity_for_object_size(object_size: usize) -> usize { + let size = if object_size < MIN_OBJECT_SIZE { + MIN_OBJECT_SIZE + } else { + object_size + }; + let per_page = page_size() / size; + let with_min = if per_page < MIN_SHUFFLE_VECTOR_LENGTH { + MIN_SHUFFLE_VECTOR_LENGTH + } else { + per_page + }; + if with_min > MAX_OBJECT_SLOTS_PER_SPAN { + MAX_OBJECT_SLOTS_PER_SPAN + } else { + with_min + } + } + + #[inline(always)] + pub const fn max_count(&self) -> usize { + self.max_count as usize + } + + #[inline(always)] + pub const fn len(&self) -> usize { + self.max_count() - self.off as usize + } + + #[inline(always)] + pub const fn is_empty(&self) -> bool { + self.off == self.max_count + } + + #[inline(always)] + pub const fn is_full(&self) -> bool { + self.off == 0 + } + + #[inline(always)] + pub const fn is_exhausted(&self) -> bool { + self.off >= self.max_count + } + + #[inline(always)] + pub fn clear(&mut self) { + self.off = self.max_count; + } + + #[inline(always)] + pub fn active_entries(&self) -> &[ShuffleEntry] { + &self.entries[self.off as usize..self.max_count as usize] + } + + #[inline] + pub fn count_entries_for_offset(&self, miniheap_offset: u16) -> usize { + self.active_entries() + .iter() + .filter(|entry| entry.miniheap_offset == miniheap_offset) + .count() + } + + #[inline] + pub fn push(&mut self, entry: ShuffleEntry) { + assert!(self.off > 0); + self.off -= 1; + let inserted = self.off as usize; + self.entries[inserted] = entry; + let swap_index = self.prng.in_range(inserted, self.max_count() - 1); + self.entries.swap(inserted, swap_index); + } + + #[inline] + pub fn pop(&mut self) -> Option { + if self.is_exhausted() { + return None; + } + let idx = self.off as usize; + let value = self.entries[idx]; + self.off += 1; + Some(value) + } + + #[inline] + pub fn refill_from_bitmap( + &mut self, + miniheap_offset: u16, + bitmap: &mut RelaxedBitmap, + ) -> usize { + let mut free_bits = *bitmap; + free_bits.invert_masked(); + bitmap.set_all(); + + let mut added = 
0usize; + for slot in free_bits.iter_set_bits() { + if self.is_full() { + let _ = bitmap.unset(slot); + continue; + } + + self.off -= 1; + self.entries[self.off as usize] = ShuffleEntry::new(miniheap_offset, slot as u16); + added += 1; + } + + if added > 1 { + self.shuffle_active(); + } + + added + } + + #[inline] + pub fn refill_from_heap( + &mut self, + miniheap_offset: u16, + heap: &super::miniheap::MiniHeap, + ) -> usize { + let free_bits = heap.bitmap().take_free_bits(); + let mut added = 0usize; + for slot in free_bits.iter_set_bits() { + if self.is_full() { + let _ = heap.free_offset(slot); + continue; + } + + self.off -= 1; + self.entries[self.off as usize] = ShuffleEntry::new(miniheap_offset, slot as u16); + added += 1; + } + + if added > 1 { + self.shuffle_active(); + } + + added + } + + fn shuffle_active(&mut self) { + let start = self.off as usize; + let end = self.max_count(); + let mut i = start; + while i < end { + let swap_index = self.prng.in_range(i, end - 1); + self.entries.swap(i, swap_index); + i += 1; + } + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/size_map.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/size_map.rs new file mode 100644 index 0000000..72332cc --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/size_map.rs @@ -0,0 +1,73 @@ +use super::constants::MAX_SMALL_ALLOCATION; + +pub const NUM_SIZE_CLASSES: usize = 25; + +pub const CLASS_TO_SIZE: [usize; NUM_SIZE_CLASSES] = [ + 16, 16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384, 448, 512, 640, 768, 896, + 1024, 2048, 4096, 8192, 16384, +]; + +#[inline(always)] +pub const fn byte_size_for_class(class: u8) -> usize { + CLASS_TO_SIZE[class as usize] +} + +#[inline(always)] +pub const fn size_class_for(size: usize) -> Option { + if size <= 16 { + Some(1) + } else if size <= 32 { + Some(2) + } else if size <= 48 { + Some(3) + } else if size <= 64 { + Some(4) + } else if size <= 80 { + Some(5) + } else if size <= 96 { + Some(6) + } else if size <= 112 { + Some(7) + } else if size <= 128 { + Some(8) + } else if size <= 160 { + Some(9) + } else if size <= 192 { + Some(10) + } else if size <= 224 { + Some(11) + } else if size <= 256 { + Some(12) + } else if size <= 320 { + Some(13) + } else if size <= 384 { + Some(14) + } else if size <= 448 { + Some(15) + } else if size <= 512 { + Some(16) + } else if size <= 640 { + Some(17) + } else if size <= 768 { + Some(18) + } else if size <= 896 { + Some(19) + } else if size <= 1024 { + Some(20) + } else if size <= 2048 { + Some(21) + } else if size <= 4096 { + Some(22) + } else if size <= 8192 { + Some(23) + } else if size <= 16384 { + Some(24) + } else { + None + } +} + +#[inline(always)] +pub const fn is_small_allocation(size: usize) -> bool { + size <= MAX_SMALL_ALLOCATION +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/span.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/span.rs new file mode 100644 index 0000000..381dbac --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/span.rs @@ -0,0 +1,45 @@ +pub const SPAN_CLASS_COUNT: u32 = 256; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Span { + pub offset: u32, + pub length: u32, +} + +impl Span { + #[inline(always)] + pub const fn new(offset: u32, length: u32) -> Self { + Self { offset, length } + } + + #[inline(always)] + pub const fn empty(self) -> bool { + self.length == 0 + } + + #[inline(always)] + pub fn split_after(&mut self, count: u32) -> Self { + assert!(count <= self.length); + let rest = Self { + 
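+            // The remainder keeps the pages after the first `count`; `self` is then
+            // truncated to exactly `count` pages before `rest` is returned.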
offset: self.offset + count, + length: self.length - count, + }; + self.length = count; + rest + } + + #[inline(always)] + pub const fn span_class(self) -> u32 { + let length = if self.length > SPAN_CLASS_COUNT { + SPAN_CLASS_COUNT + } else { + self.length + }; + length - 1 + } + + #[inline(always)] + pub fn byte_length_for_page_size(self, page_size: usize) -> usize { + self.length as usize * page_size + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/stats.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/stats.rs new file mode 100644 index 0000000..f72e572 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/stats.rs @@ -0,0 +1,267 @@ +use core::sync::atomic::{AtomicU64, Ordering}; + +use super::page::page_size; + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct CompactionEstimate { + pub candidate_heaps: u32, + pub candidate_pages: u32, + pub candidate_free_bytes: usize, + pub best_case_meshes: u32, + pub best_case_reclaimable_pages: u32, + pub best_case_reclaimable_bytes: usize, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct MeshStats { + pub arena_size: usize, + pub reserved_bytes: usize, + pub reusable_span_count: u32, + pub reusable_span_bytes: usize, + pub live_miniheaps: u32, + pub live_small_heaps: u32, + pub partial_small_heaps: u32, + pub full_small_heaps: u32, + pub meshed_small_heaps: u32, + pub reusable_small_heaps: u32, + pub live_large_allocations: u32, + pub live_small_bytes: usize, + pub live_large_bytes: usize, + pub retained_small_span_bytes: usize, + pub retained_large_span_bytes: usize, + pub virtual_small_span_bytes: usize, + pub small_allocations: u64, + pub small_deallocations: u64, + pub large_allocations: u64, + pub large_deallocations: u64, + pub compact_calls: u64, + pub meshes_performed: u64, + pub meshed_pages: u64, + pub meshed_bytes: u64, + pub compaction: CompactionEstimate, +} + +impl MeshStats { + #[inline(always)] + pub const fn live_bytes(&self) -> usize { + self.live_small_bytes + self.live_large_bytes + } + + #[inline(always)] + pub const fn retained_bytes(&self) -> usize { + self.retained_small_span_bytes + self.retained_large_span_bytes + } + + #[inline(always)] + pub const fn small_fragmentation_bytes(&self) -> usize { + self.retained_small_span_bytes + .saturating_sub(self.live_small_bytes) + } + + #[inline(always)] + pub const fn mesh_alias_bytes(&self) -> usize { + self.virtual_small_span_bytes + .saturating_sub(self.retained_small_span_bytes) + } + + pub fn compaction_advice(&self) -> CompactionAdvice { + let fragmented = self.small_fragmentation_bytes(); + let retained = self.retained_small_span_bytes; + let fragmentation_percent = fragmented + .saturating_mul(100) + .checked_div(retained) + .unwrap_or(0) + .min(100) as u8; + + let reclaimable = self.compaction.best_case_reclaimable_bytes; + let page = page_size(); + let recommendation = if self.compaction.best_case_meshes == 0 || reclaimable < page { + CompactionRecommendation::Idle + } else if reclaimable >= page * 4 + && (fragmentation_percent >= 20 || reclaimable.saturating_mul(4) >= retained.max(page)) + { + CompactionRecommendation::Compact + } else if reclaimable >= page + && (fragmentation_percent >= 10 || self.compaction.candidate_heaps >= 2) + { + CompactionRecommendation::Consider + } else { + CompactionRecommendation::Idle + }; + + CompactionAdvice { + recommendation, + fragmentation_bytes: fragmented, + fragmentation_percent, + candidate_heaps: self.compaction.candidate_heaps, + 
candidate_free_bytes: self.compaction.candidate_free_bytes, + best_case_meshes: self.compaction.best_case_meshes, + best_case_reclaimable_pages: self.compaction.best_case_reclaimable_pages, + best_case_reclaimable_bytes: reclaimable, + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct CompactionAdvice { + pub recommendation: CompactionRecommendation, + pub fragmentation_bytes: usize, + pub fragmentation_percent: u8, + pub candidate_heaps: u32, + pub candidate_free_bytes: usize, + pub best_case_meshes: u32, + pub best_case_reclaimable_pages: u32, + pub best_case_reclaimable_bytes: usize, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum CompactionRecommendation { + #[default] + Idle, + Consider, + Compact, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct RuntimeCompactionPolicy { + pub minimum_recommendation: CompactionRecommendation, + pub min_fragmentation_bytes: usize, + pub min_reclaimable_bytes: usize, + pub min_candidate_heaps: u32, +} + +impl RuntimeCompactionPolicy { + pub fn should_compact(&self, advice: &CompactionAdvice) -> bool { + recommendation_rank(advice.recommendation) + >= recommendation_rank(self.minimum_recommendation) + && advice.fragmentation_bytes >= self.min_fragmentation_bytes + && advice.best_case_reclaimable_bytes >= self.min_reclaimable_bytes + && advice.candidate_heaps >= self.min_candidate_heaps + } +} + +impl Default for RuntimeCompactionPolicy { + fn default() -> Self { + let page = page_size(); + Self { + minimum_recommendation: CompactionRecommendation::Consider, + min_fragmentation_bytes: page, + min_reclaimable_bytes: page, + min_candidate_heaps: 2, + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum RuntimeCompactionResult { + Compacted { + meshes: usize, + advice: CompactionAdvice, + }, + Skipped { + reason: CompactionSkipReason, + advice: Option, + }, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CompactionSkipReason { + AllocatorUnavailable, + ThreadUnavailable, + NotAtSafepoint, + ThreadsActive, + Policy, +} + +#[inline(always)] +const fn recommendation_rank(recommendation: CompactionRecommendation) -> u8 { + match recommendation { + CompactionRecommendation::Idle => 0, + CompactionRecommendation::Consider => 1, + CompactionRecommendation::Compact => 2, + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct CounterSnapshot { + pub small_allocations: u64, + pub small_deallocations: u64, + pub large_allocations: u64, + pub large_deallocations: u64, + pub compact_calls: u64, + pub meshes_performed: u64, + pub meshed_pages: u64, + pub meshed_bytes: u64, +} + +#[derive(Debug)] +pub(crate) struct StatsState { + small_allocations: AtomicU64, + small_deallocations: AtomicU64, + large_allocations: AtomicU64, + large_deallocations: AtomicU64, + compact_calls: AtomicU64, + meshes_performed: AtomicU64, + meshed_pages: AtomicU64, + meshed_bytes: AtomicU64, +} + +impl StatsState { + pub const fn new() -> Self { + Self { + small_allocations: AtomicU64::new(0), + small_deallocations: AtomicU64::new(0), + large_allocations: AtomicU64::new(0), + large_deallocations: AtomicU64::new(0), + compact_calls: AtomicU64::new(0), + meshes_performed: AtomicU64::new(0), + meshed_pages: AtomicU64::new(0), + meshed_bytes: AtomicU64::new(0), + } + } + + #[inline(always)] + pub fn record_small_allocation(&self) { + self.small_allocations.fetch_add(1, Ordering::Relaxed); + } + + #[inline(always)] + pub fn record_small_deallocation(&self) { + 
self.small_deallocations.fetch_add(1, Ordering::Relaxed); + } + + #[inline(always)] + pub fn record_large_allocation(&self) { + self.large_allocations.fetch_add(1, Ordering::Relaxed); + } + + #[inline(always)] + pub fn record_large_deallocation(&self) { + self.large_deallocations.fetch_add(1, Ordering::Relaxed); + } + + #[inline(always)] + pub fn record_compact_call(&self) { + self.compact_calls.fetch_add(1, Ordering::Relaxed); + } + + #[inline(always)] + pub fn record_mesh(&self, pages: u32, bytes: usize) { + self.meshes_performed.fetch_add(1, Ordering::Relaxed); + self.meshed_pages.fetch_add(pages as u64, Ordering::Relaxed); + self.meshed_bytes.fetch_add(bytes as u64, Ordering::Relaxed); + } + + pub fn snapshot(&self) -> CounterSnapshot { + CounterSnapshot { + small_allocations: self.small_allocations.load(Ordering::Relaxed), + small_deallocations: self.small_deallocations.load(Ordering::Relaxed), + large_allocations: self.large_allocations.load(Ordering::Relaxed), + large_deallocations: self.large_deallocations.load(Ordering::Relaxed), + compact_calls: self.compact_calls.load(Ordering::Relaxed), + meshes_performed: self.meshes_performed.load(Ordering::Relaxed), + meshed_pages: self.meshed_pages.load(Ordering::Relaxed), + meshed_bytes: self.meshed_bytes.load(Ordering::Relaxed), + } + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/sync.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/sync.rs new file mode 100644 index 0000000..80274cb --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/sync.rs @@ -0,0 +1,137 @@ +use core::sync::atomic::{AtomicU32, Ordering}; + +use super::platform; +use super::raw_sys; + +const UNLOCKED: u32 = 0; +const LOCKED: u32 = 1; +const CONTENDED: u32 = 2; + +#[derive(Debug)] +pub struct FutexMutex { + state: AtomicU32, +} + +impl FutexMutex { + pub const fn new() -> Self { + Self { + state: AtomicU32::new(UNLOCKED), + } + } + + pub fn lock(&self) -> FutexMutexGuard<'_> { + if self + .state + .compare_exchange(UNLOCKED, LOCKED, Ordering::Acquire, Ordering::Relaxed) + .is_err() + { + self.lock_contended(); + } + + FutexMutexGuard { mutex: self } + } + + fn lock_contended(&self) { + loop { + if self.state.swap(CONTENDED, Ordering::Acquire) == UNLOCKED { + return; + } + + match unsafe { + platform::futex_wait(self.state_ptr(), raw_sys::FUTEX_WAIT_PRIVATE, CONTENDED) + } { + Ok(()) => {} + Err(error) if matches!(error.errno(), raw_sys::EAGAIN | raw_sys::EINTR) => {} + Err(_) => {} + } + } + } + + fn unlock(&self) { + if self.state.fetch_sub(1, Ordering::Release) != LOCKED { + self.state.store(UNLOCKED, Ordering::Release); + let _ = + unsafe { platform::futex_wake(self.state_ptr(), raw_sys::FUTEX_WAKE_PRIVATE, 1) }; + } + } + + #[inline(always)] + fn state_ptr(&self) -> *const u32 { + (&self.state as *const AtomicU32).cast::() + } +} + +impl Default for FutexMutex { + fn default() -> Self { + Self::new() + } +} + +pub struct FutexMutexGuard<'a> { + mutex: &'a FutexMutex, +} + +impl Drop for FutexMutexGuard<'_> { + fn drop(&mut self) { + self.mutex.unlock(); + } +} + +pub(crate) fn futex_wait_for_value(state: &AtomicU32, expected: u32) { + match unsafe { + platform::futex_wait( + (state as *const AtomicU32).cast::(), + raw_sys::FUTEX_WAIT_PRIVATE, + expected, + ) + } { + Ok(()) => {} + Err(error) if matches!(error.errno(), raw_sys::EAGAIN | raw_sys::EINTR) => {} + Err(_) => {} + } +} + +pub(crate) fn futex_wake_all(state: &AtomicU32) { + let _ = unsafe { + platform::futex_wake( + (state as *const AtomicU32).cast::(), + 
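+            // Wake every waiter on this word: futex(2) takes the wake count as an i32,
+            // so i32::MAX is the conventional "wake all" value.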
raw_sys::FUTEX_WAKE_PRIVATE, + i32::MAX as u32, + ) + }; +} + +#[cfg(test)] +mod tests { + use core::sync::atomic::{AtomicUsize, Ordering}; + + use std::sync::Arc; + use std::thread; + use std::vec::Vec; + + use super::FutexMutex; + + #[test] + fn futex_mutex_serializes_multiple_threads() { + let mutex = Arc::new(FutexMutex::new()); + let counter = Arc::new(AtomicUsize::new(0)); + let mut threads = Vec::new(); + + for _ in 0..4 { + let mutex = Arc::clone(&mutex); + let counter = Arc::clone(&counter); + threads.push(thread::spawn(move || { + for _ in 0..5000 { + let _guard = mutex.lock(); + counter.fetch_add(1, Ordering::Relaxed); + } + })); + } + + for thread in threads { + thread.join().unwrap(); + } + + assert_eq!(counter.load(Ordering::Acquire), 20_000); + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mesh_alloc/thread_local_heap.rs b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/thread_local_heap.rs new file mode 100644 index 0000000..e429517 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mesh_alloc/thread_local_heap.rs @@ -0,0 +1,124 @@ +use core::array; +use core::ptr::null; + +use super::constants::{MAX_ATTACHED_MINIHEAPS_PER_CLASS, MIN_OBJECT_SIZE, NUM_SIZE_CLASSES}; +use super::miniheap::{MiniHeap, MiniHeapId}; +use super::platform; +use super::raw_sys; +use super::rng::Mwc; +use super::shuffle::ShuffleVector; +use super::size_map::byte_size_for_class; + +#[derive(Clone, Copy, Debug)] +pub(crate) struct ClassState { + pub shuffle: ShuffleVector, + pub attached_ids: [MiniHeapId; MAX_ATTACHED_MINIHEAPS_PER_CLASS], + pub attached_heaps: [*const MiniHeap; MAX_ATTACHED_MINIHEAPS_PER_CLASS], + pub attached_len: u8, + pub attached_cursor: u8, +} + +impl ClassState { + fn new(object_size: usize, seed1: u64, seed2: u64) -> Self { + Self { + shuffle: ShuffleVector::for_object_size(object_size, seed1, seed2), + attached_ids: [MiniHeapId::new(0); MAX_ATTACHED_MINIHEAPS_PER_CLASS], + attached_heaps: [null(); MAX_ATTACHED_MINIHEAPS_PER_CLASS], + attached_len: 0, + attached_cursor: 0, + } + } + + #[inline(always)] + pub fn clear_attached(&mut self) { + self.attached_len = 0; + self.attached_cursor = 0; + self.shuffle.clear(); + + let mut index = 0usize; + while index < MAX_ATTACHED_MINIHEAPS_PER_CLASS { + self.attached_ids[index] = MiniHeapId::new(0); + self.attached_heaps[index] = null(); + index += 1; + } + } + + #[inline(always)] + pub fn attached_full(&self) -> bool { + self.attached_len as usize == MAX_ATTACHED_MINIHEAPS_PER_CLASS + } + + #[inline(always)] + pub fn push_attached(&mut self, id: MiniHeapId, heap: *const MiniHeap) -> Option { + if self.attached_full() { + return None; + } + let index = self.attached_len as usize; + self.attached_ids[index] = id; + self.attached_heaps[index] = heap; + self.attached_len += 1; + Some(index as u8) + } + + #[inline(always)] + pub fn find_attached(&self, id: MiniHeapId) -> Option { + let len = self.attached_len as usize; + let mut i = 0usize; + while i < len { + if self.attached_ids[i] == id { + return Some(i as u8); + } + i += 1; + } + None + } + + #[inline(always)] + pub fn heap_at(&self, index: usize) -> Option<&MiniHeap> { + if index >= self.attached_len as usize { + return None; + } + let heap = self.attached_heaps[index]; + if heap.is_null() { + return None; + } + + unsafe { Some(&*heap) } + } +} + +#[derive(Debug)] +pub struct ThreadLocalHeap { + thread_id: u32, + classes: [ClassState; NUM_SIZE_CLASSES], +} + +impl ThreadLocalHeap { + pub fn new() -> raw_sys::Result { + let thread_id = platform::gettid()?; + let mut 
seed_rng = Mwc::from_os_seed()?; + let classes = array::from_fn(|class| { + let object_size = byte_size_for_class(class as u8).max(MIN_OBJECT_SIZE); + let seed1 = seed_rng.next_u32() as u64 + 1; + let seed2 = seed_rng.next_u32() as u64 + 1; + ClassState::new(object_size, seed1, seed2) + }); + + Ok(Self { thread_id, classes }) + } + + #[inline(always)] + pub const fn thread_id(&self) -> u32 { + self.thread_id + } + + #[inline(always)] + pub(crate) fn class(&self, class: u8) -> &ClassState { + &self.classes[class as usize] + } + + #[inline(always)] + pub(crate) fn class_mut(&mut self, class: u8) -> &mut ClassState { + &mut self.classes[class as usize] + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/mod.rs b/lib/runtime/src/platform/linux_x86_64/mod.rs new file mode 100644 index 0000000..a6a1110 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/mod.rs @@ -0,0 +1,4 @@ +pub mod mesh_alloc; +pub mod reactor; +pub mod runtime; +pub(crate) mod uring; diff --git a/lib/runtime/src/platform/linux_x86_64/reactor.rs b/lib/runtime/src/platform/linux_x86_64/reactor.rs new file mode 100644 index 0000000..3a833cb --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/reactor.rs @@ -0,0 +1,340 @@ +use std::cell::Cell; +use std::cell::RefCell; +use std::collections::BTreeMap; +use std::io; +use std::os::fd::RawFd; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use super::uring::{IORING_OP_ASYNC_CANCEL, IoUring, IoUringCqe, IoUringSqe}; + +const WAKE_TARGET_TOKEN: u64 = 1; +const TOKEN_KIND_SHIFT: u64 = 56; +const TOKEN_KIND_MASK: u64 = 0xff << TOKEN_KIND_SHIFT; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[repr(u8)] +enum CompletionKind { + Timer = 1, + TimerRemove = 2, + NotifySend = 3, + Operation = 4, + OperationCancel = 5, +} + +type CompletionHandler = Box; + +struct NotifierInner { + ring_fd: RawFd, + closed: AtomicBool, +} + +impl NotifierInner { + fn notify(&self) -> io::Result<()> { + if self.closed.load(Ordering::Acquire) { + return Err(io::Error::new( + io::ErrorKind::BrokenPipe, + "target runtime ring is closed", + )); + } + + IoUring::with_submitter(|ring| { + ring.submit_msg_ring( + self.ring_fd, + WAKE_TARGET_TOKEN, + 1, + make_token(CompletionKind::NotifySend, 0), + ) + }) + } +} + +#[derive(Clone)] +pub struct ThreadNotifier { + inner: Arc, +} + +impl ThreadNotifier { + pub fn notify(&self) -> io::Result<()> { + self.inner.notify() + } +} + +#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] +pub struct ReadyEvents { + pub timer: bool, + pub wake: bool, +} + +pub struct Reactor { + ring: IoUring, + notifier: Arc, + next_token: Cell, + active_timer_token: Cell>, + pending_wakes: Cell, + pending_timers: Cell, + completions: RefCell>, +} + +pub fn create() -> io::Result<(Reactor, ThreadNotifier)> { + create_reactor() +} + +pub fn create_reactor() -> io::Result<(Reactor, ThreadNotifier)> { + let ring = IoUring::new(64)?; + let notifier = Arc::new(NotifierInner { + ring_fd: ring.ring_fd(), + closed: AtomicBool::new(false), + }); + + Ok(( + Reactor { + ring, + notifier: Arc::clone(¬ifier), + next_token: Cell::new(1), + active_timer_token: Cell::new(None), + pending_wakes: Cell::new(0), + pending_timers: Cell::new(0), + completions: RefCell::new(BTreeMap::new()), + }, + ThreadNotifier { inner: notifier }, + )) +} + +impl Reactor { + pub(crate) fn bind_current_thread(&self) { + self.ring.bind_current_thread(); + } + + pub(crate) fn unbind_current_thread(&self) { + self.ring.unbind_current_thread(); + } + + pub fn 
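+    // Non-blocking drain of the completion queue: returns Some(ReadyEvents) when at
+    // least one CQE was processed in this call, and None when the queue was empty.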
poll(&self) -> io::Result> { + let mut ready = ReadyEvents::default(); + let saw_any = self + .ring + .drain_completions(|cqe| self.process_cqe(cqe, &mut ready)); + if saw_any { Ok(Some(ready)) } else { Ok(None) } + } + + pub fn wait(&self) -> io::Result<()> { + self.ring.wait_for_cqe() + } + + pub fn rearm_timer(&self, deadline: Option) -> io::Result<()> { + match (self.active_timer_token.get(), deadline) { + (Some(active), Some(deadline)) => { + self.ring.submit_timeout_update(active, deadline)?; + } + (Some(active), None) => { + self.active_timer_token.set(None); + self.ring + .submit_timeout_remove(active, self.next_token(CompletionKind::TimerRemove))?; + } + (None, Some(deadline)) => { + let token = self.next_token(CompletionKind::Timer); + self.active_timer_token.set(Some(token)); + self.ring.submit_timeout(token, deadline)?; + } + (None, None) => {} + } + + Ok(()) + } + + pub(crate) fn submit_operation( + &self, + fill: impl FnOnce(&mut IoUringSqe), + on_complete: impl FnOnce(IoUringCqe) + Send + 'static, + ) -> io::Result { + let token = self.next_token(CompletionKind::Operation); + self.completions + .borrow_mut() + .insert(token, Box::new(on_complete)); + + if let Err(error) = self.ring.submit_with_token(token, fill) { + let _ = self.completions.borrow_mut().remove(&token); + return Err(error); + } + + Ok(token) + } + + pub(crate) fn cancel_operation(&self, token: u64) -> io::Result<()> { + self.ring + .submit_with_token(self.next_token(CompletionKind::OperationCancel), |sqe| { + sqe.opcode = IORING_OP_ASYNC_CANCEL; + sqe.fd = -1; + sqe.addr = token; + }) + } + + pub fn drain_wake(&self) -> io::Result { + let wakes = self.pending_wakes.replace(0); + if wakes == 0 { + Err(io::Error::new( + io::ErrorKind::WouldBlock, + "no wake completions are pending", + )) + } else { + Ok(wakes) + } + } + + pub fn drain_timer(&self) -> io::Result { + let timers = self.pending_timers.replace(0); + if timers == 0 { + Err(io::Error::new( + io::ErrorKind::WouldBlock, + "no timer completions are pending", + )) + } else { + Ok(timers) + } + } + + fn process_cqe(&self, cqe: IoUringCqe, ready: &mut ReadyEvents) { + if cqe.user_data == WAKE_TARGET_TOKEN { + ready.wake = true; + let wakes = cqe.res.max(1) as u64; + self.pending_wakes + .set(self.pending_wakes.get().saturating_add(wakes)); + return; + } + + match decode_token_kind(cqe.user_data) { + Some(CompletionKind::Timer) => { + if self.active_timer_token.get() == Some(cqe.user_data) { + self.active_timer_token.set(None); + } + if cqe.res == -libc::ETIME { + ready.timer = true; + self.pending_timers + .set(self.pending_timers.get().saturating_add(1)); + } + } + Some(CompletionKind::Operation) => { + if let Some(callback) = self.completions.borrow_mut().remove(&cqe.user_data) { + callback(cqe); + } + } + Some(CompletionKind::TimerRemove) + | Some(CompletionKind::NotifySend) + | Some(CompletionKind::OperationCancel) + | None => {} + } + } + + fn next_token(&self, kind: CompletionKind) -> u64 { + let seq = self.next_token.get(); + self.next_token.set(seq.wrapping_add(1)); + make_token(kind, seq) + } +} + +impl Drop for Reactor { + fn drop(&mut self) { + self.notifier.closed.store(true, Ordering::Release); + } +} + +pub fn monotonic_now() -> io::Result { + let mut now = std::mem::MaybeUninit::::uninit(); + let result = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, now.as_mut_ptr()) }; + if result == -1 { + return Err(io::Error::last_os_error()); + } + + let now = unsafe { now.assume_init() }; + Ok(Duration::new(now.tv_sec as u64, now.tv_nsec as u32)) 
+} + +fn make_token(kind: CompletionKind, seq: u64) -> u64 { + ((kind as u64) << TOKEN_KIND_SHIFT) | (seq & !TOKEN_KIND_MASK) +} + +fn decode_token_kind(token: u64) -> Option { + match ((token & TOKEN_KIND_MASK) >> TOKEN_KIND_SHIFT) as u8 { + 1 => Some(CompletionKind::Timer), + 2 => Some(CompletionKind::TimerRemove), + 3 => Some(CompletionKind::NotifySend), + 4 => Some(CompletionKind::Operation), + 5 => Some(CompletionKind::OperationCancel), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::{create_reactor, monotonic_now}; + use std::thread; + use std::time::Duration; + + #[test] + fn notifier_wakes_target_ring() { + let (sender, _) = create_reactor().expect("sender reactor should initialize"); + sender.bind_current_thread(); + + let (target, notifier) = create_reactor().expect("target reactor should initialize"); + notifier.notify().expect("notify should succeed"); + + let ready = loop { + if let Some(ready) = target.poll().expect("poll should succeed") { + break ready; + } + thread::sleep(Duration::from_millis(1)); + }; + + assert!(ready.wake); + assert!(!ready.timer); + assert_eq!(target.drain_wake().expect("wake drain should succeed"), 1); + sender.unbind_current_thread(); + } + + #[test] + fn notifier_wakes_target_ring_from_plain_thread() { + let (target, notifier) = create_reactor().expect("target reactor should initialize"); + + thread::spawn(move || { + notifier.notify().expect("notify should succeed"); + }) + .join() + .expect("notifier thread should exit cleanly"); + + let ready = loop { + if let Some(ready) = target.poll().expect("poll should succeed") { + break ready; + } + thread::sleep(Duration::from_millis(1)); + }; + + assert!(ready.wake); + assert!(!ready.timer); + assert_eq!(target.drain_wake().expect("wake drain should succeed"), 1); + } + + #[test] + fn timeout_reports_deadlines() { + let (reactor, _notifier) = create_reactor().expect("reactor should initialize"); + let deadline = monotonic_now().expect("clock should work") + Duration::from_millis(20); + reactor + .rearm_timer(Some(deadline)) + .expect("timer should arm"); + + let ready = loop { + if let Some(ready) = reactor.poll().expect("poll should succeed") { + break ready; + } + thread::sleep(Duration::from_millis(5)); + }; + + assert!(ready.timer); + assert!(!ready.wake); + assert_eq!( + reactor.drain_timer().expect("timer drain should succeed"), + 1 + ); + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/runtime.rs b/lib/runtime/src/platform/linux_x86_64/runtime.rs new file mode 100644 index 0000000..5deffd2 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/runtime.rs @@ -0,0 +1,1067 @@ +use std::cell::{Cell, RefCell}; +use std::collections::{BTreeMap, VecDeque}; +use std::future::Future; +use std::pin::Pin; +use std::ptr; +use std::rc::Rc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; +use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker}; +use std::time::Duration; + +use super::reactor::{Reactor, ThreadNotifier, create as create_reactor, monotonic_now}; + +type LocalTask = Box; +type SendTask = Box; +type LocalBoxFuture = Pin + 'static>>; + +#[thread_local] +static mut CURRENT_THREAD: *mut ThreadState = ptr::null_mut(); + +#[derive(Clone)] +pub struct ThreadHandle { + shared: Arc, +} + +pub struct WorkerHandle { + thread: ThreadHandle, + completion: Arc, +} + +#[derive(Clone)] +pub struct TimeoutHandle { + id: usize, + owner: *const ThreadState, + _local: Rc<()>, +} + +#[derive(Clone)] +pub struct IntervalHandle { + 
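+    // Mirrors TimeoutHandle above: the raw owner pointer plus the non-Send `Rc<()>`
+    // tie the handle to the thread that created it, which `clear_timer` asserts.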
id: usize, + owner: *const ThreadState, + _local: Rc<()>, +} + +pub struct JoinHandle { + state: Rc>, +} + +pub struct YieldNow { + yielded: bool, +} + +pub fn current_thread_handle() -> ThreadHandle { + current_thread().handle() +} + +pub(crate) fn try_current_thread_handle() -> Option { + unsafe { (!CURRENT_THREAD.is_null()).then(|| (*CURRENT_THREAD).handle()) } +} + +pub(crate) fn with_current_reactor(f: impl FnOnce(&Reactor) -> T) -> T { + f(¤t_thread().reactor) +} + +pub fn queue_task(task: F) +where + F: FnOnce() + 'static, +{ + push_local_macrotask(Box::new(task)); +} + +pub fn queue_microtask(task: F) +where + F: FnOnce() + 'static, +{ + current_thread() + .local_microtasks + .borrow_mut() + .push_back(Box::new(task)); +} + +pub fn set_timeout(delay: Duration, callback: F) -> TimeoutHandle +where + F: FnOnce() + 'static, +{ + let owner = current_thread_ptr(); + let id = allocate_timer_id(); + let deadline = deadline_from_now(delay); + let timer = TimerNode::timeout(id, deadline, Box::new(callback)); + + current_thread().timers.borrow_mut().insert(timer); + rearm_thread_timer(); + + TimeoutHandle { + id, + owner, + _local: Rc::new(()), + } +} + +pub fn clear_timeout(handle: &TimeoutHandle) { + clear_timer(handle.owner, handle.id); +} + +pub fn set_interval(delay: Duration, callback: F) -> IntervalHandle +where + F: FnMut() + 'static, +{ + let owner = current_thread_ptr(); + let id = allocate_timer_id(); + let deadline = deadline_from_now(delay); + let timer = TimerNode::interval( + id, + deadline, + delay, + Rc::new(RefCell::new(Box::new(callback))), + ); + + current_thread().timers.borrow_mut().insert(timer); + rearm_thread_timer(); + + IntervalHandle { + id, + owner, + _local: Rc::new(()), + } +} + +pub fn clear_interval(handle: &IntervalHandle) { + clear_timer(handle.owner, handle.id); +} + +pub fn queue_future(future: F) -> JoinHandle +where + F: Future + 'static, + F::Output: 'static, +{ + let state = Rc::new(JoinState::new()); + let completion = Rc::clone(&state); + let task = Rc::new(FutureTask { + future: RefCell::new(Some(Box::pin(async move { + let output = future.await; + completion.complete(output); + }))), + queued: Cell::new(false), + }); + + task.schedule(); + + JoinHandle { state } +} + +pub fn spawn_worker(initial_task: Init, on_exit: Exit) -> WorkerHandle +where + Init: FnOnce() + Send + 'static, + Exit: FnOnce() + 'static, +{ + let parent = current_thread(); + let (reactor, notifier) = create_reactor().expect("worker reactor should initialize"); + let shared = Arc::new(ThreadShared::new(notifier)); + let handle = ThreadHandle { + shared: Arc::clone(&shared), + }; + let completion = Arc::new(WorkerCompletion { + finished: AtomicBool::new(false), + parent_event: parent.handle(), + }); + + parent.children.borrow_mut().push(ChildWorker { + completion: Arc::clone(&completion), + on_exit: Some(Box::new(on_exit)), + }); + + let worker_completion = Arc::clone(&completion); + std::thread::Builder::new() + .name("ruin-runtime-worker".into()) + .spawn(move || { + install_thread(shared, reactor, Some(worker_completion)); + queue_task(initial_task); + run(); + }) + .expect("worker thread should spawn"); + + WorkerHandle { + thread: handle, + completion, + } +} + +pub fn run() { + let _ = current_thread(); + + loop { + drain_all(); + + while let Some(task) = pop_microtask() { + task(); + drain_all(); + } + + if let Some(task) = pop_macrotask() { + task(); + continue; + } + + drain_all(); + + if has_ready_work() { + continue; + } + + let state = current_thread(); + if 
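+        // Shutdown handshake: flip `closing`, then re-check for work that raced in.
+        // The loop only parks on the reactor (or exits) once queues, timers, child
+        // workers, and in-flight async operations are all idle.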
!state.try_begin_shutdown() { + continue; + } + + drain_all(); + + if has_ready_work() { + state.shared.closing.store(false, Ordering::Release); + continue; + } + + if has_pending_timers() || state.has_live_children() || state.has_live_async_operations() { + state.shared.closing.store(false, Ordering::Release); + state.reactor.wait().expect("reactor wait should succeed"); + continue; + } + + if let Some(completion) = &state.worker_completion { + completion.finished.store(true, Ordering::Release); + completion.parent_event.shared.notify(); + } + + state.shared.closed.store(true, Ordering::Release); + state.shared.notify(); + teardown_thread(); + return; + } +} + +fn drain_all() { + drain_reactor_events(); + drain_remote_tasks(); + drain_completed_workers(); +} + +pub fn yield_now() -> YieldNow { + YieldNow { yielded: false } +} + +impl ThreadHandle { + pub fn queue_task(&self, task: F) -> bool + where + F: FnOnce() + Send + 'static, + { + self.shared.enqueue_macro(Box::new(task)) + } + + pub fn queue_microtask(&self, task: F) -> bool + where + F: FnOnce() + Send + 'static, + { + self.shared.enqueue_micro(Box::new(task)) + } + + pub fn is_closed(&self) -> bool { + self.shared.closed.load(Ordering::Acquire) + } + + #[allow(dead_code)] + pub(crate) fn begin_async_operation(&self) { + self.shared.pending_ops.fetch_add(1, Ordering::AcqRel); + } + + #[allow(dead_code)] + pub(crate) fn finish_async_operation(&self) { + let previous = self.shared.pending_ops.fetch_sub(1, Ordering::AcqRel); + debug_assert!(previous > 0, "async operation count underflow"); + self.shared.notify(); + } +} + +impl WorkerHandle { + pub fn queue_task(&self, task: F) -> bool + where + F: FnOnce() + Send + 'static, + { + self.thread.queue_task(task) + } + + pub fn queue_microtask(&self, task: F) -> bool + where + F: FnOnce() + Send + 'static, + { + self.thread.queue_microtask(task) + } + + pub fn is_finished(&self) -> bool { + self.completion.finished.load(Ordering::Acquire) + } + + pub fn thread(&self) -> ThreadHandle { + self.thread.clone() + } +} + +impl Future for JoinHandle { + type Output = T; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.state.poll(cx) + } +} + +impl Future for YieldNow { + type Output = (); + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if self.yielded { + Poll::Ready(()) + } else { + self.yielded = true; + cx.waker().wake_by_ref(); + Poll::Pending + } + } +} + +struct ThreadState { + reactor: Reactor, + shared: Arc, + worker_completion: Option>, + local_microtasks: RefCell>, + local_macrotasks: RefCell>, + timers: RefCell, + next_timer_id: Cell, + children: RefCell>, +} + +impl ThreadState { + fn new( + shared: Arc, + reactor: Reactor, + worker_completion: Option>, + ) -> Self { + Self { + reactor, + shared, + worker_completion, + local_microtasks: RefCell::new(VecDeque::new()), + local_macrotasks: RefCell::new(VecDeque::new()), + timers: RefCell::new(TimerHeap::new()), + next_timer_id: Cell::new(1), + children: RefCell::new(Vec::new()), + } + } + + fn handle(&self) -> ThreadHandle { + ThreadHandle { + shared: Arc::clone(&self.shared), + } + } + + fn has_live_children(&self) -> bool { + !self.children.borrow().is_empty() + } + + fn has_live_async_operations(&self) -> bool { + self.shared.pending_ops.load(Ordering::Acquire) != 0 + } + + fn try_begin_shutdown(&self) -> bool { + self.shared + .closing + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + } +} + +struct ThreadShared { + notifier: ThreadNotifier, + 
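+    // Cross-thread state: remote queues are drained into the owner's local queues,
+    // `pending_ops` counts in-flight async operations, and `closing`/`closed` drive
+    // the shutdown handshake in `run()`.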
remote_microtasks: Mutex>, + remote_macrotasks: Mutex>, + pending_ops: AtomicUsize, + closing: AtomicBool, + closed: AtomicBool, +} + +impl ThreadShared { + fn new(notifier: ThreadNotifier) -> Self { + Self { + notifier, + remote_microtasks: Mutex::new(VecDeque::new()), + remote_macrotasks: Mutex::new(VecDeque::new()), + pending_ops: AtomicUsize::new(0), + closing: AtomicBool::new(false), + closed: AtomicBool::new(false), + } + } + + fn enqueue_micro(&self, task: SendTask) -> bool { + if self.closed.load(Ordering::Acquire) { + return false; + } + + lock_queue(&self.remote_microtasks).push_back(task); + self.notify(); + true + } + + fn enqueue_macro(&self, task: SendTask) -> bool { + if self.closed.load(Ordering::Acquire) { + return false; + } + + lock_queue(&self.remote_macrotasks).push_back(task); + self.notify(); + true + } + + fn notify(&self) { + self.notifier + .notify() + .expect("thread notifier should succeed"); + } +} + +struct ChildWorker { + completion: Arc, + on_exit: Option, +} + +struct WorkerCompletion { + finished: AtomicBool, + parent_event: ThreadHandle, +} + +struct TimerNode { + id: usize, + deadline: Duration, + kind: TimerKind, +} + +enum TimerKind { + Timeout(LocalTask), + Interval { + interval: Duration, + callback: Rc>>, + }, +} + +impl TimerNode { + fn timeout(id: usize, deadline: Duration, callback: LocalTask) -> Self { + Self { + id, + deadline, + kind: TimerKind::Timeout(callback), + } + } + + fn interval( + id: usize, + deadline: Duration, + interval: Duration, + callback: Rc>>, + ) -> Self { + Self { + id, + deadline, + kind: TimerKind::Interval { interval, callback }, + } + } +} + +struct TimerHeap { + nodes: Vec, + positions: BTreeMap, +} + +impl TimerHeap { + fn new() -> Self { + Self { + nodes: Vec::new(), + positions: BTreeMap::new(), + } + } + + fn is_empty(&self) -> bool { + self.nodes.is_empty() + } + + fn peek_deadline(&self) -> Option { + self.nodes.first().map(|node| node.deadline) + } + + fn insert(&mut self, node: TimerNode) { + let index = self.nodes.len(); + self.positions.insert(node.id, index); + self.nodes.push(node); + self.sift_up(index); + } + + fn remove(&mut self, id: usize) -> Option { + let index = *self.positions.get(&id)?; + self.positions.remove(&id); + + let last = self.nodes.pop().expect("heap index should be valid"); + if index == self.nodes.len() { + return Some(last); + } + + let removed = std::mem::replace(&mut self.nodes[index], last); + self.positions.insert(self.nodes[index].id, index); + self.fix(index); + Some(removed) + } + + fn pop_min(&mut self) -> Option { + let id = self.nodes.first()?.id; + self.remove(id) + } + + fn pop_due(&mut self, now: Duration) -> Vec { + let mut due = Vec::new(); + while self.peek_deadline().is_some_and(|deadline| deadline <= now) { + due.push(self.pop_min().expect("timer heap should contain a minimum")); + } + due + } + + fn fix(&mut self, index: usize) { + if index > 0 && self.less(index, parent(index)) { + self.sift_up(index); + } else { + self.sift_down(index); + } + } + + fn sift_up(&mut self, mut index: usize) { + while index > 0 { + let parent = parent(index); + if !self.less(index, parent) { + break; + } + self.swap(index, parent); + index = parent; + } + } + + fn sift_down(&mut self, mut index: usize) { + loop { + let left = index * 2 + 1; + let right = left + 1; + let mut smallest = index; + + if left < self.nodes.len() && self.less(left, smallest) { + smallest = left; + } + if right < self.nodes.len() && self.less(right, smallest) { + smallest = right; + } + if smallest == index { 
+ break; + } + + self.swap(index, smallest); + index = smallest; + } + } + + fn less(&self, lhs: usize, rhs: usize) -> bool { + let left = &self.nodes[lhs]; + let right = &self.nodes[rhs]; + (left.deadline, left.id) < (right.deadline, right.id) + } + + fn swap(&mut self, lhs: usize, rhs: usize) { + self.nodes.swap(lhs, rhs); + self.positions.insert(self.nodes[lhs].id, lhs); + self.positions.insert(self.nodes[rhs].id, rhs); + } +} + +struct FutureTask { + future: RefCell>, + queued: Cell, +} + +impl FutureTask { + fn schedule(self: &Rc) { + if self.queued.replace(true) { + return; + } + + let task = Rc::clone(self); + queue_microtask(move || task.poll()); + } + + fn poll(self: Rc) { + self.queued.set(false); + + let Some(mut future) = self.future.borrow_mut().take() else { + return; + }; + + let waker = self.waker(); + let mut context = Context::from_waker(&waker); + match future.as_mut().poll(&mut context) { + Poll::Ready(()) => {} + Poll::Pending => { + *self.future.borrow_mut() = Some(future); + } + } + } + + fn waker(self: &Rc) -> Waker { + unsafe { + Waker::from_raw(RawWaker::new( + Rc::into_raw(Rc::clone(self)).cast::<()>(), + &FUTURE_TASK_WAKER_VTABLE, + )) + } + } +} + +static FUTURE_TASK_WAKER_VTABLE: RawWakerVTable = RawWakerVTable::new( + future_task_clone, + future_task_wake, + future_task_wake_by_ref, + future_task_drop, +); + +struct JoinState { + result: RefCell>, + waker: RefCell>, + ready: Cell, +} + +impl JoinState { + fn new() -> Self { + Self { + result: RefCell::new(None), + waker: RefCell::new(None), + ready: Cell::new(false), + } + } + + fn complete(&self, value: T) { + *self.result.borrow_mut() = Some(value); + self.ready.set(true); + + if let Some(waker) = self.waker.borrow_mut().take() { + waker.wake(); + } + } + + fn poll(&self, cx: &mut Context<'_>) -> Poll { + if self.ready.get() { + return Poll::Ready( + self.result + .borrow_mut() + .take() + .expect("join handle polled after completion"), + ); + } + + *self.waker.borrow_mut() = Some(cx.waker().clone()); + Poll::Pending + } +} + +unsafe fn future_task_clone(data: *const ()) -> RawWaker { + let task = unsafe { Rc::::from_raw(data.cast::()) }; + let clone = Rc::clone(&task); + let _ = Rc::into_raw(task); + RawWaker::new(Rc::into_raw(clone).cast::<()>(), &FUTURE_TASK_WAKER_VTABLE) +} + +unsafe fn future_task_wake(data: *const ()) { + let task = unsafe { Rc::::from_raw(data.cast::()) }; + task.schedule(); +} + +unsafe fn future_task_wake_by_ref(data: *const ()) { + let task = unsafe { Rc::::from_raw(data.cast::()) }; + task.schedule(); + let _ = Rc::into_raw(task); +} + +unsafe fn future_task_drop(data: *const ()) { + drop(unsafe { Rc::::from_raw(data.cast::()) }); +} + +fn current_thread() -> &'static ThreadState { + unsafe { + if CURRENT_THREAD.is_null() { + let (reactor, notifier) = create_reactor().expect("runtime reactor should initialize"); + let shared = Arc::new(ThreadShared::new(notifier)); + let state = Box::new(ThreadState::new(shared, reactor, None)); + let state = Box::into_raw(state); + (*state).reactor.bind_current_thread(); + CURRENT_THREAD = state; + } + + &*CURRENT_THREAD + } +} + +fn current_thread_ptr() -> *const ThreadState { + current_thread() as *const ThreadState +} + +fn install_thread( + shared: Arc, + reactor: Reactor, + worker_completion: Option>, +) { + unsafe { + debug_assert!(CURRENT_THREAD.is_null(), "thread runtime already installed"); + let state = Box::new(ThreadState::new(shared, reactor, worker_completion)); + let state = Box::into_raw(state); + 
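+        // Bind the reactor's ring to this OS thread before publishing CURRENT_THREAD,
+        // matching the lazy initialization path in `current_thread()` above.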
(*state).reactor.bind_current_thread(); + CURRENT_THREAD = state; + } +} + +fn teardown_thread() { + unsafe { + let state = CURRENT_THREAD; + CURRENT_THREAD = ptr::null_mut(); + + if !state.is_null() { + (*state).reactor.unbind_current_thread(); + drop(Box::from_raw(state)); + } + } +} + +fn drain_reactor_events() { + loop { + let ready = current_thread() + .reactor + .poll() + .expect("reactor poll should succeed"); + + let Some(ready) = ready else { + break; + }; + + let state = current_thread(); + if ready.wake { + let _ = state + .reactor + .drain_wake() + .expect("wake drain should succeed"); + } + if ready.timer { + let _ = state + .reactor + .drain_timer() + .expect("timer drain should succeed"); + dispatch_expired_timers(); + } + } +} + +fn drain_remote_tasks() { + let state = current_thread(); + + { + let mut local = state.local_microtasks.borrow_mut(); + let mut remote = lock_queue(&state.shared.remote_microtasks); + while let Some(task) = remote.pop_front() { + local.push_back(task); + } + } + + { + let mut local = state.local_macrotasks.borrow_mut(); + let mut remote = lock_queue(&state.shared.remote_macrotasks); + while let Some(task) = remote.pop_front() { + local.push_back(task); + } + } +} + +fn drain_completed_workers() { + let state = current_thread(); + let mut exited = Vec::new(); + + { + let mut children = state.children.borrow_mut(); + let mut index = 0; + while index < children.len() { + if children[index].completion.finished.load(Ordering::Acquire) { + let child = children.swap_remove(index); + exited.push(child); + } else { + index += 1; + } + } + } + + if exited.is_empty() { + return; + } + + let mut local = state.local_macrotasks.borrow_mut(); + for mut child in exited { + if let Some(task) = child.on_exit.take() { + local.push_back(task); + } + } +} + +fn pop_microtask() -> Option { + current_thread().local_microtasks.borrow_mut().pop_front() +} + +fn pop_macrotask() -> Option { + current_thread().local_macrotasks.borrow_mut().pop_front() +} + +fn push_local_macrotask(task: LocalTask) { + current_thread() + .local_macrotasks + .borrow_mut() + .push_back(task); +} + +fn has_ready_work() -> bool { + let state = current_thread(); + if !state.local_microtasks.borrow().is_empty() || !state.local_macrotasks.borrow().is_empty() { + return true; + } + + if !lock_queue(&state.shared.remote_microtasks).is_empty() + || !lock_queue(&state.shared.remote_macrotasks).is_empty() + { + return true; + } + + false +} + +fn has_pending_timers() -> bool { + !current_thread().timers.borrow().is_empty() +} + +fn allocate_timer_id() -> usize { + let state = current_thread(); + let id = state.next_timer_id.get(); + state.next_timer_id.set(id.wrapping_add(1)); + id +} + +fn clear_timer(owner: *const ThreadState, id: usize) { + assert!( + ptr::eq(current_thread_ptr(), owner), + "timer handles must be cleared on their originating thread" + ); + + if current_thread().timers.borrow_mut().remove(id).is_some() { + rearm_thread_timer(); + } +} + +fn dispatch_expired_timers() { + let now = deadline_from_now(Duration::ZERO); + let due = current_thread().timers.borrow_mut().pop_due(now); + + if due.is_empty() { + rearm_thread_timer(); + return; + } + + for mut timer in due { + match timer.kind { + TimerKind::Timeout(callback) => push_local_macrotask(callback), + TimerKind::Interval { interval, callback } => { + let mut next_deadline = timer.deadline; + + loop { + let queued_callback = Rc::clone(&callback); + push_local_macrotask(Box::new(move || { + (queued_callback.borrow_mut())(); + })); + + 
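+                        // Catch-up: queue one callback per missed period, then stop as
+                        // soon as the next deadline is in the future (or saturates at
+                        // Duration::MAX after a failed checked_add).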
next_deadline = next_deadline.checked_add(interval).unwrap_or(Duration::MAX); + if next_deadline > now || next_deadline == Duration::MAX { + break; + } + } + + timer.deadline = next_deadline; + timer.kind = TimerKind::Interval { interval, callback }; + current_thread().timers.borrow_mut().insert(timer); + } + } + } + + rearm_thread_timer(); +} + +fn rearm_thread_timer() { + let deadline = current_thread().timers.borrow().peek_deadline(); + current_thread() + .reactor + .rearm_timer(deadline) + .expect("timerfd rearm should succeed"); +} + +fn deadline_from_now(delay: Duration) -> Duration { + monotonic_now() + .expect("monotonic clock should be available") + .checked_add(delay) + .unwrap_or(Duration::MAX) +} + +const fn parent(index: usize) -> usize { + (index - 1) / 2 +} + +fn lock_queue(queue: &Mutex>) -> MutexGuard<'_, VecDeque> { + queue.lock().expect("runtime queue poisoned") +} + +#[cfg(test)] +mod tests { + use crate::op::completion::completion_for_current_thread; + + use super::{ + clear_interval, current_thread_handle, queue_future, queue_microtask, queue_task, run, + set_interval, set_timeout, spawn_worker, yield_now, + }; + use std::cell::{Cell, RefCell}; + use std::rc::Rc; + use std::sync::{Arc, Mutex}; + use std::thread; + use std::time::Duration; + + #[test] + fn runtime_executes_local_and_remote_work() { + let log = Arc::new(Mutex::new(Vec::::new())); + let main_handle = current_thread_handle(); + + { + let log = Arc::clone(&log); + queue_task(move || log.lock().unwrap().push("main task".into())); + } + { + let log = Arc::clone(&log); + queue_microtask(move || log.lock().unwrap().push("main microtask".into())); + } + { + let log = Arc::clone(&log); + queue_future(async move { + log.lock().unwrap().push("main future start".into()); + yield_now().await; + log.lock().unwrap().push("main future end".into()); + }); + } + { + let log = Arc::clone(&log); + set_timeout(Duration::from_millis(5), move || { + log.lock().unwrap().push("main timeout".into()); + }); + } + { + let log = Arc::clone(&log); + let handle_slot = Rc::new(RefCell::new(None)); + let handle_slot_clone = Rc::clone(&handle_slot); + let tick_count = Rc::new(Cell::new(0usize)); + let tick_count_clone = Rc::clone(&tick_count); + let interval = set_interval(Duration::from_millis(3), move || { + let next = tick_count_clone.get() + 1; + tick_count_clone.set(next); + log.lock().unwrap().push(format!("main interval {next}")); + if next == 2 { + let handle = handle_slot_clone.borrow_mut().take().unwrap(); + clear_interval(&handle); + } + }); + *handle_slot.borrow_mut() = Some(interval); + } + + { + let worker_log = Arc::clone(&log); + let exit_log = Arc::clone(&log); + let main_handle_for_worker = main_handle.clone(); + spawn_worker( + move || { + let log = Arc::clone(&worker_log); + queue_task({ + let log = Arc::clone(&log); + move || log.lock().unwrap().push("worker task".into()) + }); + queue_microtask({ + let log = Arc::clone(&log); + move || log.lock().unwrap().push("worker microtask".into()) + }); + queue_future({ + let log = Arc::clone(&log); + async move { + log.lock().unwrap().push("worker future start".into()); + yield_now().await; + log.lock().unwrap().push("worker future end".into()); + } + }); + set_timeout(Duration::from_millis(7), move || { + main_handle_for_worker.queue_task({ + let log = Arc::clone(&log); + move || log.lock().unwrap().push("worker timeout to main".into()) + }); + }); + }, + { + let log = Arc::clone(&exit_log); + move || log.lock().unwrap().push("worker exit".into()) + }, + ); + } + + run(); + 
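+        // run() only returns once the main thread and the spawned worker have drained
+        // their task queues, timers, and children, so every entry asserted below must
+        // already be present in the log.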
+ let log = log.lock().unwrap(); + assert!(log.iter().any(|entry| entry == "main task")); + assert!(log.iter().any(|entry| entry == "main microtask")); + assert!(log.iter().any(|entry| entry == "main future start")); + assert!(log.iter().any(|entry| entry == "main future end")); + assert!(log.iter().any(|entry| entry == "main timeout")); + assert!(log.iter().any(|entry| entry == "main interval 1")); + assert!(log.iter().any(|entry| entry == "main interval 2")); + assert!(log.iter().any(|entry| entry == "worker task")); + assert!(log.iter().any(|entry| entry == "worker microtask")); + assert!(log.iter().any(|entry| entry == "worker future start")); + assert!(log.iter().any(|entry| entry == "worker future end")); + assert!(log.iter().any(|entry| entry == "worker timeout to main")); + assert!(log.iter().any(|entry| entry == "worker exit")); + } + + #[test] + fn runtime_waits_for_cross_thread_operation_completion() { + let observed = Arc::new(Mutex::new(None::)); + + { + let observed = Arc::clone(&observed); + queue_task(move || { + let (completion, source) = completion_for_current_thread::(); + + thread::spawn(move || { + source.complete(7); + }); + + queue_future(async move { + let value = completion.await; + *observed.lock().unwrap() = Some(value); + }); + }); + } + + run(); + + assert_eq!(*observed.lock().unwrap(), Some(7)); + } +} diff --git a/lib/runtime/src/platform/linux_x86_64/uring.rs b/lib/runtime/src/platform/linux_x86_64/uring.rs new file mode 100644 index 0000000..801a2f1 --- /dev/null +++ b/lib/runtime/src/platform/linux_x86_64/uring.rs @@ -0,0 +1,478 @@ +use std::cell::Cell; +use std::io; +use std::os::fd::RawFd; +use std::ptr; +use std::sync::atomic::{Ordering, compiler_fence}; +use std::sync::{Mutex, OnceLock}; +use std::time::Duration; + +const IORING_OFF_SQ_RING: libc::off_t = 0; +const IORING_OFF_CQ_RING: libc::off_t = 0x0800_0000; +const IORING_OFF_SQES: libc::off_t = 0x1000_0000; + +const IORING_ENTER_GETEVENTS: u32 = 1 << 0; +const IORING_SETUP_CLAMP: u32 = 1 << 4; + +const IORING_FEAT_SINGLE_MMAP: u32 = 1 << 0; + +pub(crate) const IORING_OP_FSYNC: u8 = 3; +pub(crate) const IORING_OP_TIMEOUT: u8 = 11; +pub(crate) const IORING_OP_TIMEOUT_REMOVE: u8 = 12; +pub(crate) const IORING_OP_ACCEPT: u8 = 13; +pub(crate) const IORING_OP_ASYNC_CANCEL: u8 = 14; +pub(crate) const IORING_OP_CONNECT: u8 = 16; +pub(crate) const IORING_OP_OPENAT: u8 = 18; +pub(crate) const IORING_OP_CLOSE: u8 = 19; +pub(crate) const IORING_OP_STATX: u8 = 21; +pub(crate) const IORING_OP_READ: u8 = 22; +pub(crate) const IORING_OP_WRITE: u8 = 23; +pub(crate) const IORING_OP_SEND: u8 = 26; +pub(crate) const IORING_OP_RECV: u8 = 27; +pub(crate) const IORING_OP_SHUTDOWN: u8 = 34; +pub(crate) const IORING_OP_RENAMEAT: u8 = 35; +pub(crate) const IORING_OP_UNLINKAT: u8 = 36; +pub(crate) const IORING_OP_MKDIRAT: u8 = 37; +pub(crate) const IORING_OP_MSG_RING: u8 = 40; +pub(crate) const IORING_OP_SOCKET: u8 = 45; +pub(crate) const IORING_OP_FTRUNCATE: u8 = 55; +pub(crate) const IORING_OP_BIND: u8 = 56; +pub(crate) const IORING_OP_LISTEN: u8 = 57; + +const IORING_MSG_DATA: u64 = 0; +pub(crate) const IORING_FSYNC_DATASYNC: u32 = 1 << 0; +pub(crate) const IORING_TIMEOUT_ABS: u32 = 1 << 0; +pub(crate) const IORING_TIMEOUT_UPDATE: u32 = 1 << 1; +pub(crate) const IOSQE_CQE_SKIP_SUCCESS: u8 = 1 << 6; + +thread_local! 
{
+    static CURRENT_SUBMITTER: Cell<*const IoUring> = const { Cell::new(ptr::null()) };
+}
+
+static GLOBAL_SUBMITTER: OnceLock<Mutex<Option<IoUring>>> = OnceLock::new();
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct IoSqringOffsets {
+    head: u32,
+    tail: u32,
+    ring_mask: u32,
+    ring_entries: u32,
+    flags: u32,
+    dropped: u32,
+    array: u32,
+    resv1: u32,
+    user_addr: u64,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct IoCqringOffsets {
+    head: u32,
+    tail: u32,
+    ring_mask: u32,
+    ring_entries: u32,
+    overflow: u32,
+    cqes: u32,
+    flags: u32,
+    resv1: u32,
+    user_addr: u64,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct IoUringParams {
+    sq_entries: u32,
+    cq_entries: u32,
+    flags: u32,
+    sq_thread_cpu: u32,
+    sq_thread_idle: u32,
+    features: u32,
+    wq_fd: u32,
+    resv: [u32; 3],
+    sq_off: IoSqringOffsets,
+    cq_off: IoCqringOffsets,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub(crate) struct IoUringSqe {
+    pub(crate) opcode: u8,
+    pub(crate) flags: u8,
+    pub(crate) ioprio: u16,
+    pub(crate) fd: i32,
+    pub(crate) off: u64,
+    pub(crate) addr: u64,
+    pub(crate) len: u32,
+    pub(crate) op_flags: u32,
+    pub(crate) user_data: u64,
+    pub(crate) buf_index: u16,
+    pub(crate) personality: u16,
+    pub(crate) file_index: i32,
+    pub(crate) pad2: [u64; 2],
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub(crate) struct IoUringCqe {
+    pub(crate) user_data: u64,
+    pub(crate) res: i32,
+    pub(crate) flags: u32,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct KernelTimespec {
+    tv_sec: i64,
+    tv_nsec: i64,
+}
+
+pub(crate) struct IoUring {
+    ring_fd: RawFd,
+    sq_ring_ptr: *mut u8,
+    cq_ring_ptr: *mut u8,
+    sqes_ptr: *mut IoUringSqe,
+    sq_ring_size: usize,
+    cq_ring_size: usize,
+    sqes_size: usize,
+    single_mmap: bool,
+    sq_head: *mut u32,
+    sq_tail: *mut u32,
+    sq_ring_mask: *mut u32,
+    sq_ring_entries: *mut u32,
+    sq_array: *mut u32,
+    cq_head: *mut u32,
+    cq_tail: *mut u32,
+    cq_ring_mask: *mut u32,
+    cqes: *mut IoUringCqe,
+}
+
+impl IoUring {
+    pub(crate) fn new(entries: u32) -> io::Result<Self> {
+        let mut params = IoUringParams {
+            flags: IORING_SETUP_CLAMP,
+            ..IoUringParams::default()
+        };
+
+        let ring_fd = cvt_long(unsafe {
+            libc::syscall(
+                libc::SYS_io_uring_setup,
+                entries as libc::c_uint,
+                &mut params as *mut IoUringParams,
+            )
+        })? as RawFd;
+
+        let sq_ring_size = params.sq_off.array as usize
+            + params.sq_entries as usize * std::mem::size_of::<u32>();
+        let cq_ring_size = params.cq_off.cqes as usize
+            + params.cq_entries as usize * std::mem::size_of::<IoUringCqe>();
+        let single_mmap = params.features & IORING_FEAT_SINGLE_MMAP != 0;
+
+        let sq_ring_ptr = mmap_ring(
+            if single_mmap {
+                sq_ring_size.max(cq_ring_size)
+            } else {
+                sq_ring_size
+            },
+            ring_fd,
+            IORING_OFF_SQ_RING,
+        )?;
+        let cq_ring_ptr = if single_mmap {
+            sq_ring_ptr
+        } else {
+            mmap_ring(cq_ring_size, ring_fd, IORING_OFF_CQ_RING)?
+        };
+        let sqes_size = params.sq_entries as usize * std::mem::size_of::<IoUringSqe>();
+        let sqes_ptr = mmap_ring(sqes_size, ring_fd, IORING_OFF_SQES)? as *mut IoUringSqe;
+
+        Ok(Self {
+            ring_fd,
+            sq_ring_ptr,
+            cq_ring_ptr,
+            sqes_ptr,
+            sq_ring_size,
+            cq_ring_size,
+            sqes_size,
+            single_mmap,
+            sq_head: offset_ptr(sq_ring_ptr, params.sq_off.head),
+            sq_tail: offset_ptr(sq_ring_ptr, params.sq_off.tail),
+            sq_ring_mask: offset_ptr(sq_ring_ptr, params.sq_off.ring_mask),
+            sq_ring_entries: offset_ptr(sq_ring_ptr, params.sq_off.ring_entries),
+            sq_array: offset_ptr(sq_ring_ptr, params.sq_off.array),
+            cq_head: offset_ptr(cq_ring_ptr, params.cq_off.head),
+            cq_tail: offset_ptr(cq_ring_ptr, params.cq_off.tail),
+            cq_ring_mask: offset_ptr(cq_ring_ptr, params.cq_off.ring_mask),
+            cqes: offset_ptr(cq_ring_ptr, params.cq_off.cqes),
+        })
+    }
+
+    pub(crate) fn ring_fd(&self) -> RawFd {
+        self.ring_fd
+    }
+
+    pub(crate) fn bind_current_thread(&self) {
+        CURRENT_SUBMITTER.with(|submitter| submitter.set(self as *const Self));
+    }
+
+    pub(crate) fn unbind_current_thread(&self) {
+        CURRENT_SUBMITTER.with(|submitter| {
+            if ptr::eq(submitter.get(), self) {
+                submitter.set(ptr::null());
+            }
+        });
+    }
+
+    pub(crate) fn with_submitter<T>(f: impl FnOnce(&IoUring) -> io::Result<T>) -> io::Result<T> {
+        CURRENT_SUBMITTER.with(|submitter| {
+            let ptr = submitter.get();
+            if !ptr.is_null() {
+                let ring = unsafe { &*ptr };
+                return f(ring);
+            }
+
+            let mut ring = global_submitter()
+                .lock()
+                .expect("global io_uring submitter should not be poisoned");
+            if ring.is_none() {
+                *ring = Some(IoUring::new(64)?);
+            }
+
+            f(ring
+                .as_ref()
+                .expect("global submitter ring should initialize"))
+        })
+    }
+
+    pub(crate) fn submit_timeout(&self, token: u64, deadline: Duration) -> io::Result<()> {
+        let timespec = duration_to_kernel_timespec(deadline);
+        self.push_sqe(|sqe| {
+            sqe.opcode = IORING_OP_TIMEOUT;
+            sqe.fd = -1;
+            sqe.off = 0;
+            sqe.user_data = token;
+            sqe.addr = (&timespec as *const KernelTimespec) as u64;
+            sqe.len = 1;
+            sqe.op_flags = IORING_TIMEOUT_ABS;
+        })?;
+        self.submit_pending().map(|_| ())
+    }
+
+    pub(crate) fn submit_timeout_remove(
+        &self,
+        token_to_remove: u64,
+        completion: u64,
+    ) -> io::Result<()> {
+        self.push_sqe(|sqe| {
+            sqe.opcode = IORING_OP_TIMEOUT_REMOVE;
+            sqe.fd = -1;
+            sqe.flags = IOSQE_CQE_SKIP_SUCCESS;
+            sqe.user_data = completion;
+            sqe.addr = token_to_remove;
+        })?;
+        self.submit_pending().map(|_| ())
+    }
+
+    pub(crate) fn submit_timeout_update(
+        &self,
+        token_to_update: u64,
+        deadline: Duration,
+    ) -> io::Result<()> {
+        let timespec = duration_to_kernel_timespec(deadline);
+        self.push_sqe(|sqe| {
+            sqe.opcode = IORING_OP_TIMEOUT_REMOVE;
+            sqe.fd = -1;
+            sqe.off = (&timespec as *const KernelTimespec) as u64;
+            sqe.addr = token_to_update;
+            sqe.op_flags = IORING_TIMEOUT_UPDATE | IORING_TIMEOUT_ABS;
+        })?;
+        self.submit_pending().map(|_| ())
+    }
+
+    pub(crate) fn submit_msg_ring(
+        &self,
+        target_ring_fd: RawFd,
+        target_user_data: u64,
+        value: u32,
+        completion: u64,
+    ) -> io::Result<()> {
+        self.push_sqe(|sqe| {
+            sqe.opcode = IORING_OP_MSG_RING;
+            sqe.flags = IOSQE_CQE_SKIP_SUCCESS;
+            sqe.fd = target_ring_fd;
+            sqe.off = target_user_data;
+            sqe.addr = IORING_MSG_DATA;
+            sqe.len = value;
+            sqe.user_data = completion;
+        })?;
+        self.submit_pending().map(|_| ())
+    }
+
+    pub(crate) fn submit_with_token(
+        &self,
+        token: u64,
+        fill: impl FnOnce(&mut IoUringSqe),
+    ) -> io::Result<()> {
+        self.push_sqe(|sqe| {
+            fill(sqe);
+            sqe.user_data = token;
+        })?;
+        self.submit_pending().map(|_| ())
+    }
+
+    pub(crate) fn drain_completions(&self, mut f: impl FnMut(IoUringCqe)) -> bool {
+        let mut head = load_u32(self.cq_head);
+        let tail = load_u32(self.cq_tail);
+        if head == tail {
+            return false;
+        }
+        let mask = load_u32(self.cq_ring_mask);
+
+        while head != tail {
+            let index = (head & mask) as usize;
+            let cqe = unsafe { ptr::read_volatile(self.cqes.add(index)) };
+            f(cqe);
+            head = head.wrapping_add(1);
+        }
+
+        store_u32(self.cq_head, head);
+        true
+    }
+
+    pub(crate) fn wait_for_cqe(&self) -> io::Result<()> {
+        loop {
+            match self.enter(0, 1, IORING_ENTER_GETEVENTS) {
+                Ok(_) => return Ok(()),
+                Err(error) if error.kind() == io::ErrorKind::Interrupted => continue,
+                Err(error) => return Err(error),
+            }
+        }
+    }
+
+    fn push_sqe(&self, fill: impl FnOnce(&mut IoUringSqe)) -> io::Result<()> {
+        let head = load_u32(self.sq_head);
+        let tail = load_u32(self.sq_tail);
+        let entries = load_u32(self.sq_ring_entries);
+        if tail.wrapping_sub(head) >= entries {
+            self.submit_pending()?;
+            let head = load_u32(self.sq_head);
+            let tail = load_u32(self.sq_tail);
+            if tail.wrapping_sub(head) >= entries {
+                return Err(io::Error::new(
+                    io::ErrorKind::WouldBlock,
+                    "io_uring submission queue is full",
+                ));
+            }
+        }
+
+        let tail = load_u32(self.sq_tail);
+        let mask = load_u32(self.sq_ring_mask);
+        let index = (tail & mask) as usize;
+        let sqe = unsafe { &mut *self.sqes_ptr.add(index) };
+        *sqe = IoUringSqe::default();
+        fill(sqe);
+        unsafe {
+            ptr::write_volatile(self.sq_array.add(index), index as u32);
+        }
+        compiler_fence(Ordering::Release);
+        store_u32(self.sq_tail, tail.wrapping_add(1));
+        Ok(())
+    }
+
+    fn submit_pending(&self) -> io::Result<u32> {
+        let head = load_u32(self.sq_head);
+        let tail = load_u32(self.sq_tail);
+        let to_submit = tail.wrapping_sub(head);
+        if to_submit == 0 {
+            return Ok(0);
+        }
+        self.enter(to_submit, 0, 0)
+    }
+
+    fn enter(&self, to_submit: u32, min_complete: u32, flags: u32) -> io::Result<u32> {
+        cvt_long(unsafe {
+            libc::syscall(
+                libc::SYS_io_uring_enter,
+                self.ring_fd,
+                to_submit as libc::c_uint,
+                min_complete as libc::c_uint,
+                flags as libc::c_uint,
+                ptr::null::<libc::sigset_t>(),
+                0usize,
+            )
+        })
+        .map(|value| value as u32)
+    }
+}
+
+impl Drop for IoUring {
+    fn drop(&mut self) {
+        unsafe {
+            libc::munmap(self.sqes_ptr.cast(), self.sqes_size);
+            if self.single_mmap {
+                libc::munmap(
+                    self.sq_ring_ptr.cast(),
+                    self.sq_ring_size.max(self.cq_ring_size),
+                );
+            } else {
+                libc::munmap(self.sq_ring_ptr.cast(), self.sq_ring_size);
+                libc::munmap(self.cq_ring_ptr.cast(), self.cq_ring_size);
+            }
+            libc::close(self.ring_fd);
+        }
+    }
+}
+
+unsafe impl Send for IoUring {}
+
+fn offset_ptr<T>(base: *mut u8, offset: u32) -> *mut T {
+    unsafe { base.add(offset as usize).cast::<T>() }
+}
+
+fn mmap_ring(length: usize, fd: RawFd, offset: libc::off_t) -> io::Result<*mut u8> {
+    let ptr = unsafe {
+        libc::mmap(
+            ptr::null_mut(),
+            length,
+            libc::PROT_READ | libc::PROT_WRITE,
+            libc::MAP_SHARED | libc::MAP_POPULATE,
+            fd,
+            offset,
+        )
+    };
+    if ptr == libc::MAP_FAILED {
+        Err(io::Error::last_os_error())
+    } else {
+        Ok(ptr.cast())
+    }
+}
+
+fn load_u32(ptr: *const u32) -> u32 {
+    let value = unsafe { ptr::read_volatile(ptr) };
+    compiler_fence(Ordering::Acquire);
+    value
+}
+
+fn store_u32(ptr: *mut u32, value: u32) {
+    compiler_fence(Ordering::Release);
+    unsafe {
+        ptr::write_volatile(ptr, value);
+    }
+}
+
+fn cvt_long(result: libc::c_long) -> io::Result<libc::c_long> {
+    if result == -1 {
+        Err(io::Error::last_os_error())
+    } else {
+        Ok(result)
+    }
+}
+
+fn global_submitter() -> &'static Mutex<Option<IoUring>> {
+    GLOBAL_SUBMITTER.get_or_init(|| Mutex::new(None))
+}
+
+fn duration_to_kernel_timespec(duration: Duration) -> KernelTimespec {
+    KernelTimespec {
tv_sec: duration.as_secs() as i64, + tv_nsec: duration.subsec_nanos() as i64, + } +} diff --git a/lib/runtime/src/platform/mod.rs b/lib/runtime/src/platform/mod.rs new file mode 100644 index 0000000..2897c2b --- /dev/null +++ b/lib/runtime/src/platform/mod.rs @@ -0,0 +1,2 @@ +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub mod linux_x86_64; diff --git a/lib/runtime/src/sys/linux/channel.rs b/lib/runtime/src/sys/linux/channel.rs new file mode 100644 index 0000000..dd9ce81 --- /dev/null +++ b/lib/runtime/src/sys/linux/channel.rs @@ -0,0 +1,10 @@ +//! Linux channel wake helpers. + +use crate::op::completion::{CompletionFuture, CompletionHandle, completion}; +use crate::platform::linux_x86_64::runtime::try_current_thread_handle; + +pub(crate) fn runtime_waiter() -> (CompletionFuture, CompletionHandle) { + let owner = try_current_thread_handle() + .expect("async channel operations must be polled on a runtime thread"); + completion(owner) +} diff --git a/lib/runtime/src/sys/linux/fs.rs b/lib/runtime/src/sys/linux/fs.rs new file mode 100644 index 0000000..84e39da --- /dev/null +++ b/lib/runtime/src/sys/linux/fs.rs @@ -0,0 +1,586 @@ +//! Linux filesystem backend. + +use std::collections::VecDeque; +use std::ffi::CString; +use std::future::poll_fn; +use std::io; +use std::mem::MaybeUninit; +use std::os::fd::{FromRawFd, OwnedFd, RawFd}; +use std::os::unix::ffi::OsStrExt; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll, Waker}; +use std::thread; + +use crate::op::completion::completion_for_current_thread; +use crate::op::fs::{FileType, FsOp, MetadataTarget, OpenOptions, RawDirEntry, RawMetadata}; +use crate::platform::linux_x86_64::runtime::{ + ThreadHandle, current_thread_handle, with_current_reactor, +}; +use crate::platform::linux_x86_64::uring::{ + IORING_FSYNC_DATASYNC, IORING_OP_CLOSE, IORING_OP_FSYNC, IORING_OP_FTRUNCATE, + IORING_OP_MKDIRAT, IORING_OP_OPENAT, IORING_OP_READ, IORING_OP_RENAMEAT, IORING_OP_STATX, + IORING_OP_UNLINKAT, IORING_OP_WRITE, IoUringCqe, +}; + +const STATX_BASIC_MASK: u32 = + libc::STATX_TYPE | libc::STATX_MODE | libc::STATX_SIZE | libc::STATX_NLINK; +const FILE_CURSOR: u64 = u64::MAX; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ExecutionPath { + IoUring, + Offload, +} + +pub fn execution_path(op: &FsOp) -> ExecutionPath { + match op { + FsOp::ReadDir { .. } | FsOp::Duplicate { .. } => ExecutionPath::Offload, + FsOp::Open { .. } + | FsOp::Read { .. } + | FsOp::Write { .. } + | FsOp::Metadata { .. } + | FsOp::SetLen { .. } + | FsOp::SyncAll { .. } + | FsOp::SyncData { .. } + | FsOp::CreateDir { .. } + | FsOp::RemoveFile { .. } + | FsOp::RemoveDir { .. } + | FsOp::Rename { .. } + | FsOp::Close { .. 
} => ExecutionPath::IoUring, + } +} + +pub async fn open(op: FsOp) -> io::Result { + let FsOp::Open { path, options } = op else { + unreachable!("open backend called with non-open op"); + }; + + let path = path_to_c_string(&path)?; + let path_ptr = path.as_ptr(); + let (flags, mode) = open_flags(&options)?; + submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_OPENAT; + sqe.fd = libc::AT_FDCWD; + sqe.addr = path_ptr as u64; + sqe.len = mode; + sqe.op_flags = flags as u32; + }, + move |cqe| { + let _path = path; + cqe_to_result(cqe).map(|fd| unsafe { OwnedFd::from_raw_fd(fd as RawFd) }) + }, + ) + .await +} + +pub async fn read(op: FsOp) -> io::Result> { + let FsOp::Read { fd, offset, len } = op else { + unreachable!("read backend called with non-read op"); + }; + + let mut buffer = vec![0; len]; + let buffer_ptr = buffer.as_mut_ptr(); + let buffer_len = buffer.len(); + submit_uring::, _>( + move |sqe| { + sqe.opcode = IORING_OP_READ; + sqe.fd = fd; + sqe.addr = buffer_ptr as u64; + sqe.len = buffer_len as u32; + sqe.off = offset.unwrap_or(FILE_CURSOR); + }, + move |cqe| { + let read = cqe_to_result(cqe)? as usize; + buffer.truncate(read); + Ok(buffer) + }, + ) + .await +} + +pub async fn write(op: FsOp) -> io::Result { + let FsOp::Write { fd, offset, data } = op else { + unreachable!("write backend called with non-write op"); + }; + let data_ptr = data.as_ptr(); + let data_len = data.len(); + + submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_WRITE; + sqe.fd = fd; + sqe.addr = data_ptr as u64; + sqe.len = data_len as u32; + sqe.off = offset.unwrap_or(FILE_CURSOR); + }, + move |cqe| { + let _data = data; + cqe_to_result(cqe).map(|written| written as usize) + }, + ) + .await +} + +pub async fn metadata(op: FsOp) -> io::Result { + let FsOp::Metadata { + target, + follow_symlinks, + } = op + else { + unreachable!("metadata backend called with non-metadata op"); + }; + + let mut statx = Box::new(MaybeUninit::::zeroed()); + let statx_ptr = statx.as_mut_ptr(); + let (fd, path, flags) = match target { + MetadataTarget::Path(path) => ( + libc::AT_FDCWD, + path_to_c_string(&path)?, + metadata_flags(follow_symlinks), + ), + MetadataTarget::File(fd) => ( + fd, + CString::new(Vec::::new()).expect("empty statx path should be valid"), + libc::AT_EMPTY_PATH, + ), + }; + let path_ptr = path.as_ptr(); + + submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_STATX; + sqe.fd = fd; + sqe.addr = path_ptr as u64; + sqe.len = STATX_BASIC_MASK; + sqe.off = statx_ptr as u64; + sqe.op_flags = flags as u32; + }, + move |cqe| { + let _path = path; + cqe_to_result(cqe)?; + let statx = unsafe { statx.assume_init() }; + Ok(raw_metadata_from_statx(&statx)) + }, + ) + .await +} + +pub async fn sync_all(op: FsOp) -> io::Result<()> { + let FsOp::SyncAll { fd } = op else { + unreachable!("sync_all backend called with non-sync_all op"); + }; + + submit_sync(fd, 0).await +} + +pub async fn sync_data(op: FsOp) -> io::Result<()> { + let FsOp::SyncData { fd } = op else { + unreachable!("sync_data backend called with non-sync_data op"); + }; + + submit_sync(fd, IORING_FSYNC_DATASYNC).await +} + +pub async fn set_len(op: FsOp) -> io::Result<()> { + let FsOp::SetLen { fd, len } = op else { + unreachable!("set_len backend called with non-set_len op"); + }; + + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_FTRUNCATE; + sqe.fd = fd; + sqe.off = len; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await +} + +pub async fn try_clone(op: FsOp) -> io::Result { + let FsOp::Duplicate { fd } = op else { 
+ unreachable!("try_clone backend called with non-duplicate op"); + }; + + offload(move || { + let duplicated = cvt(unsafe { libc::fcntl(fd, libc::F_DUPFD_CLOEXEC, 0) })?; + Ok(unsafe { OwnedFd::from_raw_fd(duplicated) }) + }) + .await +} + +pub async fn create_dir(op: FsOp) -> io::Result<()> { + let FsOp::CreateDir { + path, + recursive: _, + mode, + } = op + else { + unreachable!("create_dir backend called with non-create_dir op"); + }; + + let path = path_to_c_string(&path)?; + let path_ptr = path.as_ptr(); + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_MKDIRAT; + sqe.fd = libc::AT_FDCWD; + sqe.addr = path_ptr as u64; + sqe.len = mode; + }, + move |cqe| { + let _path = path; + cqe_to_result(cqe).map(|_| ()) + }, + ) + .await +} + +pub async fn remove_file(op: FsOp) -> io::Result<()> { + let FsOp::RemoveFile { path } = op else { + unreachable!("remove_file backend called with non-remove_file op"); + }; + + submit_unlink(path, 0).await +} + +pub async fn remove_dir(op: FsOp) -> io::Result<()> { + let FsOp::RemoveDir { path } = op else { + unreachable!("remove_dir backend called with non-remove_dir op"); + }; + + submit_unlink(path, libc::AT_REMOVEDIR).await +} + +pub async fn rename(op: FsOp) -> io::Result<()> { + let FsOp::Rename { from, to } = op else { + unreachable!("rename backend called with non-rename op"); + }; + + let from = path_to_c_string(&from)?; + let to = path_to_c_string(&to)?; + let from_ptr = from.as_ptr(); + let to_ptr = to.as_ptr(); + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_RENAMEAT; + sqe.fd = libc::AT_FDCWD; + sqe.addr = from_ptr as u64; + sqe.len = libc::AT_FDCWD as u32; + sqe.off = to_ptr as u64; + sqe.op_flags = 0; + }, + move |cqe| { + let _from = from; + let _to = to; + cqe_to_result(cqe).map(|_| ()) + }, + ) + .await +} + +pub async fn close(op: FsOp) -> io::Result<()> { + let FsOp::Close { fd } = op else { + unreachable!("close backend called with non-close op"); + }; + + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_CLOSE; + sqe.fd = fd; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await +} + +pub fn read_dir(op: FsOp) -> io::Result { + let FsOp::ReadDir { path } = op else { + unreachable!("read_dir backend called with non-read_dir op"); + }; + + ReadDirStream::new(path) +} + +pub struct ReadDirStream { + state: Arc, +} + +impl ReadDirStream { + fn new(path: PathBuf) -> io::Result { + let state = Arc::new(ReadDirState::new(current_thread_handle())); + let producer = Arc::clone(&state); + + thread::Builder::new() + .name("ruin-runtime-read-dir".into()) + .spawn(move || produce_dir_entries(path, producer)) + .map_err(io::Error::other)?; + + Ok(Self { state }) + } + + pub async fn next_entry(&mut self) -> io::Result> { + poll_fn(|cx| self.state.poll_next(cx)).await + } +} + +struct ReadDirState { + owner: ThreadHandle, + queue: Mutex>>, + done: AtomicBool, + pending: AtomicBool, + wake_queued: AtomicBool, + waker: Mutex>, +} + +impl ReadDirState { + fn new(owner: ThreadHandle) -> Self { + owner.begin_async_operation(); + Self { + owner, + queue: Mutex::new(VecDeque::new()), + done: AtomicBool::new(false), + pending: AtomicBool::new(true), + wake_queued: AtomicBool::new(false), + waker: Mutex::new(None), + } + } + + fn push(self: &Arc, entry: io::Result) { + self.queue.lock().unwrap().push_back(entry); + self.notify(); + } + + fn finish(self: &Arc) { + self.done.store(true, Ordering::Release); + self.release_pending(); + self.notify(); + } + + fn release_pending(&self) { + if 
self.pending.swap(false, Ordering::AcqRel) { + self.owner.finish_async_operation(); + } + } + + fn notify(self: &Arc) { + if self.wake_queued.swap(true, Ordering::AcqRel) { + return; + } + + let state = Arc::clone(self); + if !self.owner.queue_microtask(move || { + state.wake_queued.store(false, Ordering::Release); + if let Some(waker) = state.waker.lock().unwrap().take() { + waker.wake(); + } + }) { + self.wake_queued.store(false, Ordering::Release); + } + } + + fn poll_next(&self, cx: &mut Context<'_>) -> Poll>> { + if let Some(entry) = self.queue.lock().unwrap().pop_front() { + return Poll::Ready(entry.map(Some)); + } + + if self.done.load(Ordering::Acquire) { + return Poll::Ready(Ok(None)); + } + + *self.waker.lock().unwrap() = Some(cx.waker().clone()); + + if let Some(entry) = self.queue.lock().unwrap().pop_front() { + let _ = self.waker.lock().unwrap().take(); + return Poll::Ready(entry.map(Some)); + } + + if self.done.load(Ordering::Acquire) { + let _ = self.waker.lock().unwrap().take(); + return Poll::Ready(Ok(None)); + } + + Poll::Pending + } +} + +impl Drop for ReadDirStream { + fn drop(&mut self) { + self.state.release_pending(); + } +} + +fn produce_dir_entries(path: PathBuf, state: Arc) { + match std::fs::read_dir(path) { + Ok(entries) => { + for entry in entries { + match entry { + Ok(entry) => { + let file_name = entry.file_name(); + state.push(Ok(RawDirEntry { + path: entry.path(), + file_name, + })); + } + Err(error) => state.push(Err(error)), + } + } + } + Err(error) => state.push(Err(error)), + } + + state.finish(); +} + +async fn submit_sync(fd: RawFd, flags: u32) -> io::Result<()> { + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_FSYNC; + sqe.fd = fd; + sqe.op_flags = flags; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await +} + +async fn submit_unlink(path: PathBuf, flags: i32) -> io::Result<()> { + let path = path_to_c_string(&path)?; + let path_ptr = path.as_ptr(); + submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_UNLINKAT; + sqe.fd = libc::AT_FDCWD; + sqe.addr = path_ptr as u64; + sqe.op_flags = flags as u32; + }, + move |cqe| { + let _path = path; + cqe_to_result(cqe).map(|_| ()) + }, + ) + .await +} + +async fn submit_uring( + fill: impl FnOnce(&mut crate::platform::linux_x86_64::uring::IoUringSqe), + map: M, +) -> io::Result +where + M: FnOnce(IoUringCqe) -> io::Result + Send + 'static, +{ + let (future, handle) = completion_for_current_thread::>(); + let callback_handle = handle.clone(); + let token = with_current_reactor(|reactor| { + reactor.submit_operation(fill, move |cqe| { + callback_handle.complete(map(cqe)); + }) + })?; + + handle.set_cancel(move || { + let _ = with_current_reactor(|reactor| reactor.cancel_operation(token)); + }); + + future.await +} + +async fn offload( + task: impl FnOnce() -> io::Result + Send + 'static, +) -> io::Result { + let (future, handle) = completion_for_current_thread::>(); + thread::Builder::new() + .name("ruin-runtime-fs-offload".into()) + .spawn(move || handle.complete(task())) + .map_err(io::Error::other)?; + future.await +} + +fn path_to_c_string(path: &Path) -> io::Result { + CString::new(path.as_os_str().as_bytes()).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "paths containing NUL bytes are not supported", + ) + }) +} + +fn open_flags(options: &OpenOptions) -> io::Result<(i32, u32)> { + if !options.read && !options.write && !options.append { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "OpenOptions requires read, write, or append 
access", + )); + } + + let mut flags = if options.read { + if options.write || options.append { + libc::O_RDWR + } else { + libc::O_RDONLY + } + } else { + libc::O_WRONLY + }; + + if options.append { + flags |= libc::O_APPEND; + } + if options.truncate { + flags |= libc::O_TRUNC; + } + if options.create_new { + flags |= libc::O_CREAT | libc::O_EXCL; + } else if options.create { + flags |= libc::O_CREAT; + } + + Ok((flags | libc::O_CLOEXEC, 0o666)) +} + +fn metadata_flags(follow_symlinks: bool) -> i32 { + let mut flags = libc::AT_NO_AUTOMOUNT; + if !follow_symlinks { + flags |= libc::AT_SYMLINK_NOFOLLOW; + } + flags +} + +fn raw_metadata_from_statx(statx: &libc::statx) -> RawMetadata { + RawMetadata { + file_type: file_type_from_mode(statx.stx_mode), + mode: statx.stx_mode, + len: statx.stx_size, + } +} + +fn file_type_from_mode(mode: u16) -> FileType { + match mode & libc::S_IFMT as u16 { + value if value == libc::S_IFREG as u16 => FileType::File, + value if value == libc::S_IFDIR as u16 => FileType::Directory, + value if value == libc::S_IFLNK as u16 => FileType::Symlink, + value if value == libc::S_IFBLK as u16 => FileType::BlockDevice, + value if value == libc::S_IFCHR as u16 => FileType::CharacterDevice, + value if value == libc::S_IFIFO as u16 => FileType::Fifo, + value if value == libc::S_IFSOCK as u16 => FileType::Socket, + _ => FileType::Unknown, + } +} + +fn cqe_to_result(cqe: IoUringCqe) -> io::Result { + if cqe.res < 0 { + Err(io::Error::from_raw_os_error(-cqe.res)) + } else { + Ok(cqe.res) + } +} + +fn cvt(value: libc::c_int) -> io::Result { + if value == -1 { + Err(io::Error::last_os_error()) + } else { + Ok(value) + } +} diff --git a/lib/runtime/src/sys/linux/mod.rs b/lib/runtime/src/sys/linux/mod.rs new file mode 100644 index 0000000..8672d2a --- /dev/null +++ b/lib/runtime/src/sys/linux/mod.rs @@ -0,0 +1,5 @@ +//! Linux backend modules. + +pub mod channel; +pub mod fs; +pub mod net; diff --git a/lib/runtime/src/sys/linux/net.rs b/lib/runtime/src/sys/linux/net.rs new file mode 100644 index 0000000..85111dc --- /dev/null +++ b/lib/runtime/src/sys/linux/net.rs @@ -0,0 +1,974 @@ +//! Linux networking backend. + +use std::ffi::c_void; +use std::future::Future; +use std::io; +use std::mem::MaybeUninit; +use std::net::{ + Ipv4Addr, Ipv6Addr, Shutdown, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs, +}; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::pin::Pin; +use std::thread; +use std::time::Duration; + +use crate::op::completion::completion_for_current_thread; +use crate::op::net::{AcceptedSocket, NetOp, ReceivedDatagram}; +use crate::platform::linux_x86_64::runtime::with_current_reactor; +use crate::platform::linux_x86_64::uring::{ + IORING_OP_ACCEPT, IORING_OP_BIND, IORING_OP_CLOSE, IORING_OP_CONNECT, IORING_OP_LISTEN, + IORING_OP_RECV, IORING_OP_SEND, IORING_OP_SHUTDOWN, IORING_OP_SOCKET, IoUringCqe, IoUringSqe, +}; + +const DEFAULT_LISTENER_BACKLOG: i32 = 1024; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ExecutionPath { + IoUring, + Offload, +} + +pub fn execution_path(op: &NetOp) -> ExecutionPath { + match op { + NetOp::Socket { .. } + | NetOp::Connect { .. } + | NetOp::Bind { .. } + | NetOp::Listen { .. } + | NetOp::Accept { .. } + | NetOp::Send { .. } + | NetOp::Recv { .. } + | NetOp::Shutdown { .. } + | NetOp::Close { .. } => ExecutionPath::IoUring, + NetOp::SendTo { .. } | NetOp::RecvFrom { .. 
} => ExecutionPath::Offload, + } +} + +pub async fn resolve_addrs(addr: A) -> io::Result> +where + A: ToSocketAddrs + Send + 'static, +{ + offload(move || { + let addrs = addr.to_socket_addrs()?.collect::>(); + if addrs.is_empty() { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + "address resolved to no socket addresses", + )) + } else { + Ok(addrs) + } + }) + .await +} + +pub async fn socket(op: NetOp) -> io::Result { + let NetOp::Socket { + domain, + socket_type, + protocol, + flags, + } = op + else { + unreachable!("socket backend called with non-socket op"); + }; + + match submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_SOCKET; + sqe.fd = domain; + sqe.off = socket_type as u64; + sqe.len = protocol as u32; + sqe.op_flags = flags; + }, + move |cqe| cqe_to_result(cqe).map(|fd| unsafe { OwnedFd::from_raw_fd(fd as RawFd) }), + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || socket_sync(domain, socket_type, protocol, flags)).await + } + result => result, + } +} + +pub async fn connect(op: NetOp) -> io::Result<()> { + let NetOp::Connect { fd, addr } = op else { + unreachable!("connect backend called with non-connect op"); + }; + + let raw_addr = RawSocketAddr::from_socket_addr(addr); + let fallback_addr = raw_addr; + let addr_ptr = raw_addr.as_ptr(); + let addr_len = raw_addr.len(); + match submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_CONNECT; + sqe.fd = fd; + sqe.addr = addr_ptr as u64; + sqe.off = addr_len as u64; + }, + move |cqe| { + let _raw_addr = raw_addr; + cqe_to_result(cqe).map(|_| ()) + }, + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || connect_sync(fd, fallback_addr)).await + } + result => result, + } +} + +pub async fn bind(op: NetOp) -> io::Result<()> { + let NetOp::Bind { fd, addr } = op else { + unreachable!("bind backend called with non-bind op"); + }; + + let raw_addr = RawSocketAddr::from_socket_addr(addr); + let fallback_addr = raw_addr; + let addr_ptr = raw_addr.as_ptr(); + let addr_len = raw_addr.len(); + match submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_BIND; + sqe.fd = fd; + sqe.addr = addr_ptr as u64; + sqe.off = addr_len as u64; + }, + move |cqe| { + let _raw_addr = raw_addr; + cqe_to_result(cqe).map(|_| ()) + }, + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || bind_sync(fd, fallback_addr)).await + } + result => result, + } +} + +pub async fn listen(op: NetOp) -> io::Result<()> { + let NetOp::Listen { fd, backlog } = op else { + unreachable!("listen backend called with non-listen op"); + }; + + match submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_LISTEN; + sqe.fd = fd; + sqe.len = backlog as u32; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || listen_sync(fd, backlog)).await + } + result => result, + } +} + +pub async fn accept(op: NetOp) -> io::Result { + let NetOp::Accept { fd } = op else { + unreachable!("accept backend called with non-accept op"); + }; + + let mut storage = Box::new(MaybeUninit::::zeroed()); + let mut addr_len = Box::new(std::mem::size_of::() as libc::socklen_t); + let storage_ptr = storage.as_mut_ptr(); + let addr_len_ptr = addr_len.as_mut() as *mut libc::socklen_t; + + match submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_ACCEPT; + sqe.fd = fd; + sqe.addr = storage_ptr as u64; + sqe.off = addr_len_ptr as u64; + }, + move |cqe| { + let accepted_fd = 
cqe_to_result(cqe)? as RawFd; + let storage = unsafe { storage.assume_init() }; + let peer_addr = socket_addr_from_storage(&storage, *addr_len)?; + Ok(AcceptedSocket { + fd: accepted_fd, + peer_addr, + }) + }, + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => offload(move || accept_sync(fd)).await, + result => result, + } +} + +pub async fn send(op: NetOp) -> io::Result { + let NetOp::Send { fd, data, flags } = op else { + unreachable!("send backend called with non-send op"); + }; + + let fallback_data = data.clone(); + let data_ptr = data.as_ptr(); + let data_len = data.len(); + match submit_uring::( + move |sqe| { + sqe.opcode = IORING_OP_SEND; + sqe.fd = fd; + sqe.addr = data_ptr as u64; + sqe.len = data_len as u32; + sqe.op_flags = flags as u32; + }, + move |cqe| { + let _data = data; + cqe_to_result(cqe).map(|written| written as usize) + }, + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || send_sync(fd, fallback_data, flags)).await + } + result => result, + } +} + +pub async fn send_to(op: NetOp) -> io::Result { + let NetOp::SendTo { + fd, + target, + data, + flags, + } = op + else { + unreachable!("send_to backend called with non-send_to op"); + }; + + let raw_addr = RawSocketAddr::from_socket_addr(target); + offload(move || send_to_sync(fd, data, raw_addr, flags)).await +} + +pub async fn recv(op: NetOp) -> io::Result> { + let NetOp::Recv { fd, len, flags } = op else { + unreachable!("recv backend called with non-recv op"); + }; + + let mut buffer = vec![0; len]; + let buffer_ptr = buffer.as_mut_ptr(); + let buffer_len = buffer.len(); + match submit_uring::, _>( + move |sqe| { + sqe.opcode = IORING_OP_RECV; + sqe.fd = fd; + sqe.addr = buffer_ptr as u64; + sqe.len = buffer_len as u32; + sqe.op_flags = flags as u32; + }, + move |cqe| { + let read = cqe_to_result(cqe)? 
as usize; + buffer.truncate(read); + Ok(buffer) + }, + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || recv_sync(fd, len, flags)).await + } + result => result, + } +} + +pub async fn recv_from(op: NetOp) -> io::Result { + let NetOp::RecvFrom { fd, len, flags } = op else { + unreachable!("recv_from backend called with non-recv_from op"); + }; + + offload(move || recv_from_sync(fd, len, flags)).await +} + +pub async fn shutdown(op: NetOp) -> io::Result<()> { + let NetOp::Shutdown { fd, how } = op else { + unreachable!("shutdown backend called with non-shutdown op"); + }; + + let fallback_how = how; + match submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_SHUTDOWN; + sqe.fd = fd; + sqe.len = shutdown_how(how) as u32; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => { + offload(move || shutdown_sync(fd, fallback_how)).await + } + result => result, + } +} + +pub async fn close(op: NetOp) -> io::Result<()> { + let NetOp::Close { fd } = op else { + unreachable!("close backend called with non-close op"); + }; + + match submit_uring::<(), _>( + move |sqe| { + sqe.opcode = IORING_OP_CLOSE; + sqe.fd = fd; + }, + move |cqe| cqe_to_result(cqe).map(|_| ()), + ) + .await + { + Err(error) if should_fallback_to_offload(&error) => offload(move || close_sync(fd)).await, + result => result, + } +} + +pub async fn connect_stream(addr: SocketAddr) -> io::Result { + let socket = socket(NetOp::Socket { + domain: socket_domain(addr), + socket_type: libc::SOCK_STREAM, + protocol: 0, + flags: libc::SOCK_CLOEXEC as u32, + }) + .await?; + + let connect_result = connect(NetOp::Connect { + fd: socket.as_raw_fd(), + addr, + }) + .await; + match connect_result { + Ok(()) => Ok(socket), + Err(error) => Err(error), + } +} + +pub async fn bind_listener(addr: SocketAddr, backlog: Option) -> io::Result { + let listener = socket(NetOp::Socket { + domain: socket_domain(addr), + socket_type: libc::SOCK_STREAM, + protocol: 0, + flags: libc::SOCK_CLOEXEC as u32, + }) + .await?; + + set_reuse_addr(listener.as_raw_fd(), true)?; + + bind(NetOp::Bind { + fd: listener.as_raw_fd(), + addr, + }) + .await?; + listen(NetOp::Listen { + fd: listener.as_raw_fd(), + backlog: backlog.unwrap_or(DEFAULT_LISTENER_BACKLOG), + }) + .await?; + Ok(listener) +} + +pub async fn bind_datagram(addr: SocketAddr) -> io::Result { + let socket = socket(NetOp::Socket { + domain: socket_domain(addr), + socket_type: libc::SOCK_DGRAM, + protocol: 0, + flags: libc::SOCK_CLOEXEC as u32, + }) + .await?; + + bind(NetOp::Bind { + fd: socket.as_raw_fd(), + addr, + }) + .await?; + Ok(socket) +} + +pub async fn duplicate(fd: RawFd) -> io::Result { + offload(move || { + let duplicated = cvt(unsafe { libc::fcntl(fd, libc::F_DUPFD_CLOEXEC, 0) })?; + Ok(unsafe { OwnedFd::from_raw_fd(duplicated) }) + }) + .await +} + +pub async fn recv_timeout( + fd: RawFd, + len: usize, + flags: i32, + timeout: Duration, +) -> io::Result> { + offload(move || { + wait_socket(fd, libc::POLLIN, timeout)?; + recv_sync(fd, len, flags) + }) + .await +} + +pub async fn send_timeout( + fd: RawFd, + data: Vec, + flags: i32, + timeout: Duration, +) -> io::Result { + offload(move || { + wait_socket(fd, libc::POLLOUT, timeout)?; + send_sync(fd, data, flags) + }) + .await +} + +pub async fn recv_from_timeout( + fd: RawFd, + len: usize, + flags: i32, + timeout: Duration, +) -> io::Result { + offload(move || { + wait_socket(fd, libc::POLLIN, timeout)?; + recv_from_sync(fd, len, 
flags) + }) + .await +} + +pub async fn send_to_timeout( + fd: RawFd, + data: Vec, + target: SocketAddr, + flags: i32, + timeout: Duration, +) -> io::Result { + offload(move || { + wait_socket(fd, libc::POLLOUT, timeout)?; + send_to_sync(fd, data, RawSocketAddr::from_socket_addr(target), flags) + }) + .await +} + +pub async fn connect_stream_timeout(addr: SocketAddr, timeout: Duration) -> io::Result { + offload(move || connect_stream_timeout_sync(addr, timeout)).await +} + +pub fn local_addr(fd: RawFd) -> io::Result { + socket_addr_with(libc::getsockname, fd) +} + +pub fn peer_addr(fd: RawFd) -> io::Result { + socket_addr_with(libc::getpeername, fd) +} + +pub fn nodelay(fd: RawFd) -> io::Result { + let mut value = 0; + let mut len = std::mem::size_of::() as libc::socklen_t; + cvt(unsafe { + libc::getsockopt( + fd, + libc::IPPROTO_TCP, + libc::TCP_NODELAY, + &mut value as *mut libc::c_int as *mut c_void, + &mut len, + ) + })?; + Ok(value != 0) +} + +pub fn broadcast(fd: RawFd) -> io::Result { + getsockopt_int(fd, libc::SOL_SOCKET, libc::SO_BROADCAST).map(|value| value != 0) +} + +pub fn set_broadcast(fd: RawFd, enabled: bool) -> io::Result<()> { + setsockopt_int(fd, libc::SOL_SOCKET, libc::SO_BROADCAST, enabled.into()) +} + +pub fn ttl(fd: RawFd) -> io::Result { + match socket_family(fd)? { + libc::AF_INET => { + getsockopt_int(fd, libc::IPPROTO_IP, libc::IP_TTL).map(|value| value as u32) + } + libc::AF_INET6 => getsockopt_int(fd, libc::IPPROTO_IPV6, libc::IPV6_UNICAST_HOPS) + .map(|value| value as u32), + family => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unsupported socket family {family} for TTL"), + )), + } +} + +pub fn set_ttl(fd: RawFd, ttl: u32) -> io::Result<()> { + let ttl = i32::try_from(ttl) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "TTL exceeds i32 range"))?; + match socket_family(fd)? 
{ + libc::AF_INET => setsockopt_int(fd, libc::IPPROTO_IP, libc::IP_TTL, ttl), + libc::AF_INET6 => setsockopt_int(fd, libc::IPPROTO_IPV6, libc::IPV6_UNICAST_HOPS, ttl), + family => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unsupported socket family {family} for TTL"), + )), + } +} + +pub fn set_nodelay(fd: RawFd, enabled: bool) -> io::Result<()> { + let value: libc::c_int = enabled.into(); + cvt(unsafe { + libc::setsockopt( + fd, + libc::IPPROTO_TCP, + libc::TCP_NODELAY, + &value as *const libc::c_int as *const c_void, + std::mem::size_of_val(&value) as libc::socklen_t, + ) + }) + .map(|_| ()) +} + +pub type RecvFuture = Pin>> + 'static>>; +pub type SendFuture = Pin> + 'static>>; +pub type ShutdownFuture = Pin> + 'static>>; + +pub fn recv_future(fd: RawFd, len: usize) -> RecvFuture { + Box::pin(recv(NetOp::Recv { fd, len, flags: 0 })) +} + +pub fn send_future(fd: RawFd, data: Vec) -> SendFuture { + Box::pin(send(NetOp::Send { fd, data, flags: 0 })) +} + +pub fn shutdown_future(fd: RawFd, how: Shutdown) -> ShutdownFuture { + Box::pin(shutdown(NetOp::Shutdown { fd, how })) +} + +async fn submit_uring( + fill: impl FnOnce(&mut IoUringSqe), + map: M, +) -> io::Result +where + M: FnOnce(IoUringCqe) -> io::Result + Send + 'static, +{ + let (future, handle) = completion_for_current_thread::>(); + let callback_handle = handle.clone(); + let token = with_current_reactor(|reactor| { + reactor.submit_operation(fill, move |cqe| { + callback_handle.complete(map(cqe)); + }) + })?; + + handle.set_cancel(move || { + let _ = with_current_reactor(|reactor| reactor.cancel_operation(token)); + }); + + future.await +} + +async fn offload( + task: impl FnOnce() -> io::Result + Send + 'static, +) -> io::Result { + let (future, handle) = completion_for_current_thread::>(); + thread::Builder::new() + .name("ruin-runtime-net-offload".into()) + .spawn(move || handle.complete(task())) + .map_err(io::Error::other)?; + future.await +} + +fn socket_domain(addr: SocketAddr) -> i32 { + match addr { + SocketAddr::V4(_) => libc::AF_INET, + SocketAddr::V6(_) => libc::AF_INET6, + } +} + +fn shutdown_how(how: Shutdown) -> i32 { + match how { + Shutdown::Read => libc::SHUT_RD, + Shutdown::Write => libc::SHUT_WR, + Shutdown::Both => libc::SHUT_RDWR, + } +} + +fn socket_addr_with( + op: unsafe extern "C" fn(RawFd, *mut libc::sockaddr, *mut libc::socklen_t) -> libc::c_int, + fd: RawFd, +) -> io::Result { + let mut storage = MaybeUninit::::zeroed(); + let mut len = std::mem::size_of::() as libc::socklen_t; + cvt(unsafe { op(fd, storage.as_mut_ptr().cast::(), &mut len) })?; + let storage = unsafe { storage.assume_init() }; + socket_addr_from_storage(&storage, len) +} + +fn set_reuse_addr(fd: RawFd, enabled: bool) -> io::Result<()> { + setsockopt_int(fd, libc::SOL_SOCKET, libc::SO_REUSEADDR, enabled.into()) +} + +fn socket_family(fd: RawFd) -> io::Result { + let mut storage = MaybeUninit::::zeroed(); + let mut len = std::mem::size_of::() as libc::socklen_t; + cvt(unsafe { libc::getsockname(fd, storage.as_mut_ptr().cast::(), &mut len) })?; + let storage = unsafe { storage.assume_init() }; + Ok(storage.ss_family as i32) +} + +fn getsockopt_int(fd: RawFd, level: i32, name: i32) -> io::Result { + let mut value = 0; + let mut len = std::mem::size_of::() as libc::socklen_t; + cvt(unsafe { + libc::getsockopt( + fd, + level, + name, + &mut value as *mut libc::c_int as *mut c_void, + &mut len, + ) + })?; + Ok(value) +} + +fn setsockopt_int(fd: RawFd, level: i32, name: i32, value: i32) -> io::Result<()> { + cvt(unsafe { + 
libc::setsockopt( + fd, + level, + name, + &value as *const libc::c_int as *const c_void, + std::mem::size_of_val(&value) as libc::socklen_t, + ) + }) + .map(|_| ()) +} + +fn socket_addr_from_storage( + storage: &libc::sockaddr_storage, + len: libc::socklen_t, +) -> io::Result { + match storage.ss_family as i32 { + libc::AF_INET => { + if len < std::mem::size_of::() as libc::socklen_t { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "short IPv4 socket address from kernel", + )); + } + + let addr = unsafe { *(storage as *const _ as *const libc::sockaddr_in) }; + Ok(SocketAddr::V4(SocketAddrV4::new( + Ipv4Addr::from(addr.sin_addr.s_addr.to_ne_bytes()), + u16::from_be(addr.sin_port), + ))) + } + libc::AF_INET6 => { + if len < std::mem::size_of::() as libc::socklen_t { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "short IPv6 socket address from kernel", + )); + } + + let addr = unsafe { *(storage as *const _ as *const libc::sockaddr_in6) }; + Ok(SocketAddr::V6(SocketAddrV6::new( + Ipv6Addr::from(addr.sin6_addr.s6_addr), + u16::from_be(addr.sin6_port), + addr.sin6_flowinfo, + addr.sin6_scope_id, + ))) + } + family => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unsupported socket address family {family}"), + )), + } +} + +#[derive(Clone, Copy)] +struct RawSocketAddr { + storage: libc::sockaddr_storage, + len: libc::socklen_t, +} + +impl RawSocketAddr { + fn from_socket_addr(addr: SocketAddr) -> Self { + match addr { + SocketAddr::V4(addr) => { + let sockaddr = libc::sockaddr_in { + sin_family: libc::AF_INET as libc::sa_family_t, + sin_port: addr.port().to_be(), + sin_addr: libc::in_addr { + s_addr: u32::from_ne_bytes(addr.ip().octets()), + }, + sin_zero: [0; 8], + }; + let mut storage = + unsafe { MaybeUninit::::zeroed().assume_init() }; + unsafe { + std::ptr::write( + &mut storage as *mut libc::sockaddr_storage as *mut libc::sockaddr_in, + sockaddr, + ); + } + Self { + storage, + len: std::mem::size_of::() as libc::socklen_t, + } + } + SocketAddr::V6(addr) => { + let sockaddr = libc::sockaddr_in6 { + sin6_family: libc::AF_INET6 as libc::sa_family_t, + sin6_port: addr.port().to_be(), + sin6_flowinfo: addr.flowinfo(), + sin6_addr: libc::in6_addr { + s6_addr: addr.ip().octets(), + }, + sin6_scope_id: addr.scope_id(), + }; + let mut storage = + unsafe { MaybeUninit::::zeroed().assume_init() }; + unsafe { + std::ptr::write( + &mut storage as *mut libc::sockaddr_storage as *mut libc::sockaddr_in6, + sockaddr, + ); + } + Self { + storage, + len: std::mem::size_of::() as libc::socklen_t, + } + } + } + } + + fn as_ptr(&self) -> *const libc::sockaddr { + &self.storage as *const libc::sockaddr_storage as *const libc::sockaddr + } + + fn len(&self) -> libc::socklen_t { + self.len + } +} + +fn cqe_to_result(cqe: IoUringCqe) -> io::Result { + if cqe.res < 0 { + Err(io::Error::from_raw_os_error(-cqe.res)) + } else { + Ok(cqe.res) + } +} + +fn cvt(value: libc::c_int) -> io::Result { + if value == -1 { + Err(io::Error::last_os_error()) + } else { + Ok(value) + } +} + +fn should_fallback_to_offload(error: &io::Error) -> bool { + matches!( + error.raw_os_error(), + Some(libc::EINVAL | libc::ENOSYS | libc::EOPNOTSUPP) + ) +} + +fn socket_sync(domain: i32, socket_type: i32, protocol: i32, flags: u32) -> io::Result { + let fd = cvt(unsafe { libc::socket(domain, socket_type | flags as i32, protocol) })?; + Ok(unsafe { OwnedFd::from_raw_fd(fd) }) +} + +fn connect_sync(fd: RawFd, addr: RawSocketAddr) -> io::Result<()> { + cvt(unsafe { libc::connect(fd, addr.as_ptr(), 
addr.len()) }).map(|_| ()) +} + +fn bind_sync(fd: RawFd, addr: RawSocketAddr) -> io::Result<()> { + cvt(unsafe { libc::bind(fd, addr.as_ptr(), addr.len()) }).map(|_| ()) +} + +fn listen_sync(fd: RawFd, backlog: i32) -> io::Result<()> { + cvt(unsafe { libc::listen(fd, backlog) }).map(|_| ()) +} + +fn accept_sync(fd: RawFd) -> io::Result { + let mut storage = MaybeUninit::::zeroed(); + let mut len = std::mem::size_of::() as libc::socklen_t; + let accepted_fd = cvt(unsafe { + libc::accept4( + fd, + storage.as_mut_ptr().cast::(), + &mut len, + libc::SOCK_CLOEXEC, + ) + })?; + let storage = unsafe { storage.assume_init() }; + let peer_addr = socket_addr_from_storage(&storage, len)?; + Ok(AcceptedSocket { + fd: accepted_fd, + peer_addr, + }) +} + +fn send_sync(fd: RawFd, data: Vec, flags: i32) -> io::Result { + let written = unsafe { libc::send(fd, data.as_ptr().cast::(), data.len(), flags) }; + cvt_long(written).map(|written| written as usize) +} + +fn send_to_sync(fd: RawFd, data: Vec, target: RawSocketAddr, flags: i32) -> io::Result { + let written = unsafe { + libc::sendto( + fd, + data.as_ptr().cast::(), + data.len(), + flags, + target.as_ptr(), + target.len(), + ) + }; + cvt_long(written).map(|written| written as usize) +} + +fn recv_sync(fd: RawFd, len: usize, flags: i32) -> io::Result> { + let mut buffer = vec![0; len]; + let read = unsafe { + libc::recv( + fd, + buffer.as_mut_ptr().cast::(), + buffer.len(), + flags, + ) + }; + let read = cvt_long(read)? as usize; + buffer.truncate(read); + Ok(buffer) +} + +fn recv_from_sync(fd: RawFd, len: usize, flags: i32) -> io::Result { + let mut buffer = vec![0; len]; + let mut storage = MaybeUninit::::zeroed(); + let mut addr_len = std::mem::size_of::() as libc::socklen_t; + let read = unsafe { + libc::recvfrom( + fd, + buffer.as_mut_ptr().cast::(), + buffer.len(), + flags, + storage.as_mut_ptr().cast::(), + &mut addr_len, + ) + }; + let read = cvt_long(read)? 
as usize; + buffer.truncate(read); + let storage = unsafe { storage.assume_init() }; + let peer_addr = socket_addr_from_storage(&storage, addr_len)?; + Ok(ReceivedDatagram { + data: buffer, + peer_addr, + }) +} + +fn shutdown_sync(fd: RawFd, how: Shutdown) -> io::Result<()> { + cvt(unsafe { libc::shutdown(fd, shutdown_how(how)) }).map(|_| ()) +} + +fn close_sync(fd: RawFd) -> io::Result<()> { + cvt(unsafe { libc::close(fd) }).map(|_| ()) +} + +fn connect_stream_timeout_sync(addr: SocketAddr, timeout: Duration) -> io::Result { + let fd = cvt(unsafe { + libc::socket( + socket_domain(addr), + libc::SOCK_STREAM | libc::SOCK_CLOEXEC | libc::SOCK_NONBLOCK, + 0, + ) + })?; + let raw_addr = RawSocketAddr::from_socket_addr(addr); + let connect_result = unsafe { libc::connect(fd, raw_addr.as_ptr(), raw_addr.len()) }; + if connect_result == 0 { + set_nonblocking(fd, false)?; + return Ok(unsafe { OwnedFd::from_raw_fd(fd) }); + } + + let error = io::Error::last_os_error(); + if error.raw_os_error() != Some(libc::EINPROGRESS) { + let _ = close_sync(fd); + return Err(error); + } + + let completion = wait_socket(fd, libc::POLLOUT, timeout) + .and_then(|_| getsockopt_int(fd, libc::SOL_SOCKET, libc::SO_ERROR)); + match completion { + Ok(0) => { + set_nonblocking(fd, false)?; + Ok(unsafe { OwnedFd::from_raw_fd(fd) }) + } + Ok(code) => { + let _ = close_sync(fd); + Err(io::Error::from_raw_os_error(code)) + } + Err(error) => { + let _ = close_sync(fd); + Err(error) + } + } +} + +fn set_nonblocking(fd: RawFd, enabled: bool) -> io::Result<()> { + let flags = cvt(unsafe { libc::fcntl(fd, libc::F_GETFL) })?; + let new_flags = if enabled { + flags | libc::O_NONBLOCK + } else { + flags & !libc::O_NONBLOCK + }; + cvt(unsafe { libc::fcntl(fd, libc::F_SETFL, new_flags) }).map(|_| ()) +} + +fn wait_socket(fd: RawFd, events: i16, timeout: Duration) -> io::Result<()> { + let timeout_ms = timeout + .as_millis() + .min(i32::MAX as u128) + .try_into() + .unwrap_or(i32::MAX); + + loop { + let mut poll_fd = libc::pollfd { + fd, + events, + revents: 0, + }; + let result = unsafe { libc::poll(&mut poll_fd, 1, timeout_ms) }; + if result == 0 { + return Err(io::Error::new( + io::ErrorKind::TimedOut, + "socket operation timed out", + )); + } + if result < 0 { + let error = io::Error::last_os_error(); + if error.kind() == io::ErrorKind::Interrupted { + continue; + } + return Err(error); + } + if poll_fd.revents & (libc::POLLERR | libc::POLLHUP | libc::POLLNVAL) != 0 { + let socket_error = getsockopt_int(fd, libc::SOL_SOCKET, libc::SO_ERROR).unwrap_or(0); + if socket_error != 0 { + return Err(io::Error::from_raw_os_error(socket_error)); + } + } + return Ok(()); + } +} + +fn cvt_long(value: libc::ssize_t) -> io::Result { + if value == -1 { + Err(io::Error::last_os_error()) + } else { + Ok(value) + } +} diff --git a/lib/runtime/src/sys/mod.rs b/lib/runtime/src/sys/mod.rs new file mode 100644 index 0000000..0260205 --- /dev/null +++ b/lib/runtime/src/sys/mod.rs @@ -0,0 +1,4 @@ +//! Platform backend implementations. + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub mod linux; diff --git a/lib/runtime/src/time.rs b/lib/runtime/src/time.rs new file mode 100644 index 0000000..35cda8c --- /dev/null +++ b/lib/runtime/src/time.rs @@ -0,0 +1,175 @@ +//! Runtime time primitives. 
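+//!
+//! A minimal usage sketch (assuming the crate exposes this module as
+//! `ruin_runtime::time`; the exact re-export path is not shown in this patch):
+//!
+//! ```ignore
+//! use std::time::Duration;
+//! use ruin_runtime::time::{sleep, timeout};
+//!
+//! async fn demo() {
+//!     // Suspend this future for at least 10 ms without blocking the runtime thread.
+//!     sleep(Duration::from_millis(10)).await;
+//!
+//!     // Race a slow operation against a 5 ms deadline; `Err(Elapsed)` means the
+//!     // deadline fired before the inner future finished.
+//!     let raced = timeout(Duration::from_millis(5), sleep(Duration::from_millis(50))).await;
+//!     assert!(raced.is_err());
+//! }
+//! ```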
+ +use std::cell::{Cell, RefCell}; +use std::fmt; +use std::future::{Future, poll_fn}; +use std::io; +use std::pin::Pin; +use std::rc::Rc; +use std::task::Waker; +use std::task::{Context, Poll}; +use std::time::Duration; + +use crate::{clear_timeout, set_timeout}; + +pub struct Sleep { + delay: Option, + state: Option>, + handle: Option, + completed: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Elapsed; + +pub fn sleep(duration: Duration) -> Sleep { + Sleep { + delay: Some(duration), + state: None, + handle: None, + completed: false, + } +} + +pub async fn timeout(duration: Duration, future: F) -> Result +where + F: Future, +{ + let mut future = std::pin::pin!(future); + let mut sleeper = std::pin::pin!(sleep(duration)); + + poll_fn(|cx| { + if let Poll::Ready(output) = future.as_mut().poll(cx) { + return Poll::Ready(Ok(output)); + } + + if let Poll::Ready(()) = sleeper.as_mut().poll(cx) { + return Poll::Ready(Err(Elapsed)); + } + + Poll::Pending + }) + .await +} + +pub fn timeout_error(action: &'static str) -> io::Error { + io::Error::new(io::ErrorKind::TimedOut, format!("{action} timed out")) +} + +impl Future for Sleep { + type Output = (); + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if self.completed { + return Poll::Ready(()); + } + + if self.state.is_none() { + let delay = self.delay.take().unwrap_or(Duration::ZERO); + let state = Rc::new(SleepState::default()); + let state_for_callback = Rc::clone(&state); + let timeout_handle = set_timeout(delay, move || state_for_callback.complete()); + self.state = Some(state); + self.handle = Some(timeout_handle); + } + + let state = self + .state + .as_ref() + .expect("sleep state should be initialized"); + if state.ready.get() { + self.completed = true; + self.state = None; + self.handle = None; + Poll::Ready(()) + } else { + *state.waker.borrow_mut() = Some(cx.waker().clone()); + if state.ready.get() { + self.completed = true; + self.state = None; + self.handle = None; + Poll::Ready(()) + } else { + Poll::Pending + } + } + } +} + +impl Drop for Sleep { + fn drop(&mut self) { + if self.completed { + return; + } + + if let Some(handle) = self.handle.take() { + clear_timeout(&handle); + } + } +} + +#[derive(Default)] +struct SleepState { + ready: Cell, + waker: RefCell>, +} + +impl SleepState { + fn complete(&self) { + self.ready.set(true); + if let Some(waker) = self.waker.borrow_mut().take() { + waker.wake(); + } + } +} + +impl fmt::Display for Elapsed { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("deadline elapsed") + } +} + +impl std::error::Error for Elapsed {} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + use std::time::Duration; + + use crate::{queue_future, queue_task, run}; + + use super::{sleep, timeout}; + + #[test] + fn sleep_and_timeout_work() { + let log = std::thread::spawn(|| { + let log = Arc::new(Mutex::new(Vec::new())); + let log_for_task = Arc::clone(&log); + + queue_task(move || { + let log_for_task = Arc::clone(&log_for_task); + queue_future(async move { + log_for_task.lock().unwrap().push("started"); + sleep(Duration::from_millis(5)).await; + log_for_task.lock().unwrap().push("slept"); + + let result = timeout(Duration::from_millis(5), async { + sleep(Duration::from_millis(20)).await; + 42usize + }) + .await; + assert!(result.is_err(), "timeout should fire first"); + log_for_task.lock().unwrap().push("timed out"); + }); + }); + run(); + + let log = log.lock().unwrap(); + log.clone() + }) + .join() + .expect("time test thread 
+
+        assert_eq!(log.as_slice(), ["started", "slept", "timed out"]);
+    }
+}
diff --git a/lib/runtime_proc_macros/Cargo.toml b/lib/runtime_proc_macros/Cargo.toml
new file mode 100644
index 0000000..4f3cc63
--- /dev/null
+++ b/lib/runtime_proc_macros/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "ruin-runtime-proc-macros"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+proc-macro2 = "1"
+quote = "1"
+syn = { version = "2", features = ["full"] }
diff --git a/lib/runtime_proc_macros/src/lib.rs b/lib/runtime_proc_macros/src/lib.rs
new file mode 100644
index 0000000..d6be52a
--- /dev/null
+++ b/lib/runtime_proc_macros/src/lib.rs
@@ -0,0 +1,128 @@
+use proc_macro::TokenStream;
+use proc_macro2::Span;
+use quote::{format_ident, quote};
+use syn::{Error, ItemFn, parse_macro_input};
+
+#[proc_macro_attribute]
+pub fn main(attr: TokenStream, item: TokenStream) -> TokenStream {
+    expand_entry(attr, item, EntryKind::Sync)
+}
+
+#[proc_macro_attribute]
+pub fn async_main(attr: TokenStream, item: TokenStream) -> TokenStream {
+    expand_entry(attr, item, EntryKind::Async)
+}
+
+#[derive(Clone, Copy)]
+enum EntryKind {
+    Sync,
+    Async,
+}
+
+fn expand_entry(attr: TokenStream, item: TokenStream, kind: EntryKind) -> TokenStream {
+    if !proc_macro2::TokenStream::from(attr).is_empty() {
+        return Error::new(
+            Span::call_site(),
+            "ruin runtime entry attributes take no arguments",
+        )
+        .to_compile_error()
+        .into();
+    }
+
+    let function = parse_macro_input!(item as ItemFn);
+    match validate_entry(&function, kind) {
+        Ok(()) => generate_entry(function, kind).into(),
+        Err(error) => error.to_compile_error().into(),
+    }
+}
+
+fn validate_entry(function: &ItemFn, kind: EntryKind) -> syn::Result<()> {
+    let signature = &function.sig;
+
+    if signature.ident != "main" {
+        return Err(Error::new_spanned(
+            &signature.ident,
+            "ruin runtime entry attribute must be attached to a function named `main`",
+        ));
+    }
+
+    if !signature.inputs.is_empty() {
+        return Err(Error::new_spanned(
+            &signature.inputs,
+            "ruin runtime entry functions cannot take parameters",
+        ));
+    }
+
+    if !signature.generics.params.is_empty() || signature.generics.where_clause.is_some() {
+        return Err(Error::new_spanned(
+            &signature.generics,
+            "ruin runtime entry functions cannot be generic",
+        ));
+    }
+
+    if signature.constness.is_some() {
+        return Err(Error::new_spanned(
+            signature.fn_token,
+            "ruin runtime entry functions cannot be const",
+        ));
+    }
+
+    if signature.unsafety.is_some() {
+        return Err(Error::new_spanned(
+            signature.fn_token,
+            "ruin runtime entry functions cannot be unsafe",
+        ));
+    }
+
+    if signature.abi.is_some() {
+        return Err(Error::new_spanned(
+            &signature.abi,
+            "ruin runtime entry functions cannot declare an ABI",
+        ));
+    }
+
+    if signature.variadic.is_some() {
+        return Err(Error::new_spanned(
+            &signature.variadic,
+            "ruin runtime entry functions cannot be variadic",
+        ));
+    }
+
+    match kind {
+        EntryKind::Sync if signature.asyncness.is_some() => Err(Error::new_spanned(
+            signature.asyncness,
+            "#[ruin_runtime::main] expects a non-async `fn main`",
+        )),
+        EntryKind::Async if signature.asyncness.is_none() => Err(Error::new_spanned(
+            signature.fn_token,
+            "#[ruin_runtime::async_main] expects an `async fn main`",
+        )),
+        _ => Ok(()),
+    }
+}
+
+fn generate_entry(mut function: ItemFn, kind: EntryKind) -> proc_macro2::TokenStream {
+    let original_name = function.sig.ident.clone();
+    let implementation_name = format_ident!("__ruin_runtime_internal_{}", original_name);
+    function.sig.ident = implementation_name.clone();
+
+    let entry_call = match kind {
+        EntryKind::Sync => quote! {
+            ::ruin_runtime::queue_task(|| {
+                let _ = #implementation_name();
+            });
+        },
+        EntryKind::Async => quote! {
+            let _ = ::ruin_runtime::queue_future(#implementation_name());
+        },
+    };
+
+    quote! {
+        #function
+
+        fn #original_name() {
+            #entry_call
+            ::ruin_runtime::run();
+        }
+    }
+}
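
Usage sketch (illustrative only, not part of the patch): assuming the runtime crate re-exports these entry attributes and the timer module under the `ruin_runtime` paths that the generated code and error messages reference, an async entry point using the `sleep`/`timeout` primitives from lib/runtime/src/time.rs would look roughly like this; the `ruin_runtime::time` module path is an assumption.

    // hypothetical example; assumes `ruin_runtime::async_main` and `ruin_runtime::time` re-exports
    use std::time::Duration;

    #[ruin_runtime::async_main]
    async fn main() {
        // suspend on the io_uring-backed timer
        ruin_runtime::time::sleep(Duration::from_millis(5)).await;

        // bound a slower future; `Err(Elapsed)` is returned when the deadline wins
        let result = ruin_runtime::time::timeout(Duration::from_millis(5), async {
            ruin_runtime::time::sleep(Duration::from_millis(20)).await;
        })
        .await;
        assert!(result.is_err());
    }

The expansion produced by `generate_entry` renames the user's `main`, queues it via `queue_future` (or `queue_task` for the sync attribute), and then calls `::ruin_runtime::run()`, so the program blocks until the runtime loop drains.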