Skip to content

feat(virtio-block): Add support for VIRTIO_BLK_T_DISCARD request type #5168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/vmm/src/devices/virtio/block/virtio/device.rs
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ use crate::devices::virtio::block::CacheType;
use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice};
use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
use crate::devices::virtio::generated::virtio_blk::{
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES,
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_F_DISCARD,
};
use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1;
use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
@@ -295,7 +295,8 @@ impl VirtioBlock {
.map_err(VirtioBlockError::RateLimiter)?
.unwrap_or_default();

let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX);
let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX)
| (1u64 << VIRTIO_BLK_F_DISCARD);

if config.cache_type == CacheType::Writeback {
avail_features |= 1u64 << VIRTIO_BLK_F_FLUSH;
23 changes: 22 additions & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/async_io.rs
Original file line number Diff line number Diff line change
@@ -110,7 +110,7 @@ impl AsyncFileEngine {
Ok(())
}

#[cfg(test)]

pub fn file(&self) -> &File {
&self.file
}
@@ -230,6 +230,27 @@ impl AsyncFileEngine {
Ok(())
}

/// Queue an asynchronous discard (hole punch) of `len` bytes at byte `offset`.
///
/// The request is submitted to the io_uring as a fallocate operation and
/// completes later through the completion queue; the wrapped `PendingRequest`
/// travels as CQE user data so it can be finished on completion.
///
/// NOTE(review): the first argument to `Operation::fallocate` is hardcoded to
/// `0` — presumably the registered-file index, confirm. Also no fallocate
/// mode (`FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE`) is conveyed anywhere,
/// so the kernel may see mode 0 (allocate) instead of a discard — verify how
/// the Sqe builder maps these fields before relying on this path.
pub fn push_discard(
    &mut self,
    offset: u64,
    len: u32,
    req: PendingRequest,
) -> Result<(), RequestError<AsyncIoError>> {
    // Wrap the request so it can be recovered from the CQE's user data.
    let wrapped_user_data = WrappedRequest::new(req);

    self.ring
        .push(Operation::fallocate(
            0,
            len,
            offset,
            wrapped_user_data,
        ))
        // On push failure the wrapped request is handed back so the caller
        // can complete it with an error status.
        .map_err(|(io_uring_error, data)| RequestError {
            req: data.req,
            error: AsyncIoError::IoUring(io_uring_error),
        })
}

fn do_pop(&mut self) -> Result<Option<Cqe<WrappedRequest>>, AsyncIoError> {
self.ring.pop().map_err(AsyncIoError::IoUring)
}
31 changes: 30 additions & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/mod.rs
Original file line number Diff line number Diff line change
@@ -6,9 +6,12 @@ pub mod sync_io;

use std::fmt::Debug;
use std::fs::File;
use libc::{c_int, off64_t, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE};
use std::os::unix::io::AsRawFd;

pub use self::async_io::{AsyncFileEngine, AsyncIoError};
pub use self::sync_io::{SyncFileEngine, SyncIoError};
use crate::device_manager::mmio::MMIO_LEN;
use crate::devices::virtio::block::virtio::PendingRequest;
use crate::devices::virtio::block::virtio::device::FileEngineType;
use crate::vstate::memory::{GuestAddress, GuestMemoryMmap};
@@ -75,7 +78,7 @@ impl FileEngine {
Ok(())
}

#[cfg(test)]

pub fn file(&self) -> &File {
match self {
FileEngine::Async(engine) => engine.file(),
@@ -172,6 +175,32 @@ impl FileEngine {
FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync),
}
}

pub fn discard(
&mut self,
offset: u64,
count: u32,
req: PendingRequest,
) -> Result<FileEngineOk, RequestError<BlockIoError>> {

match self {
FileEngine::Async(engine) => match engine.push_discard(offset, count, req) {
Ok(_) => Ok(FileEngineOk::Submitted),
Err(err) => Err(RequestError {
req: err.req,
error: BlockIoError::Async(err.error),
}),
},
FileEngine::Sync(engine) => match engine.discard(offset,count) {
Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })),
Err(err) => Err(RequestError {
req,
error: BlockIoError::Sync(err),
}),
},
}
}

}

#[cfg(test)]
32 changes: 31 additions & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs
Original file line number Diff line number Diff line change
@@ -3,6 +3,8 @@

use std::fs::File;
use std::io::{Seek, SeekFrom, Write};
use libc::{c_int, off64_t, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE};
use std::os::unix::io::AsRawFd;

use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile};

@@ -18,6 +20,8 @@ pub enum SyncIoError {
SyncAll(std::io::Error),
/// Transfer: {0}
Transfer(GuestMemoryError),
/// Discard: {0}
Discard(std::io::Error)
}

#[derive(Debug)]
@@ -33,7 +37,7 @@ impl SyncFileEngine {
SyncFileEngine { file }
}

#[cfg(test)]

pub fn file(&self) -> &File {
&self.file
}
@@ -81,4 +85,30 @@ impl SyncFileEngine {
// Sync data out to physical media on host.
self.file.sync_all().map_err(SyncIoError::SyncAll)
}

pub fn discard(&mut self, offset: u64, len: u32) -> Result<u32, SyncIoError> {

unsafe {
let ret = libc::fallocate(
self.file.as_raw_fd(),
libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
offset as i64,
len as i64,
);
if ret != 0 {
return Err(SyncIoError::Discard(std::io::Error::last_os_error()));
}
}
Ok(len)
}

/// Thin safe wrapper around `fallocate(2)`.
///
/// `mode` is a bitmask of `FALLOC_FL_*` flags; `offset` and `len` select
/// the byte range to operate on. Returns the OS error on failure.
///
/// NOTE(review): this helper is not used by `discard` above, which calls
/// `libc::fallocate` directly — consider deduplicating or removing it.
pub fn fallocate(fd: c_int, mode: i32, offset: off64_t, len: off64_t) -> Result<(), std::io::Error> {
    // SAFETY: fallocate only operates on the given fd and does not touch
    // caller memory; an invalid fd is reported via the return value.
    let ret: i32 = unsafe { libc::fallocate(fd, mode, offset, len) };
    if ret == 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}

}
3 changes: 3 additions & 0 deletions src/vmm/src/devices/virtio/block/virtio/metrics.rs
Original file line number Diff line number Diff line change
@@ -157,6 +157,8 @@ pub struct BlockDeviceMetrics {
pub invalid_reqs_count: SharedIncMetric,
/// Number of flushes operation triggered on this block device.
pub flush_count: SharedIncMetric,
/// Number of discard operations triggered on this block device.
pub discard_count: SharedIncMetric,
/// Number of events triggered on the queue of this block device.
pub queue_event_count: SharedIncMetric,
/// Number of events ratelimiter-related.
@@ -210,6 +212,7 @@ impl BlockDeviceMetrics {
self.invalid_reqs_count
.add(other.invalid_reqs_count.fetch_diff());
self.flush_count.add(other.flush_count.fetch_diff());
self.discard_count.add(other.discard_count.fetch_diff());
self.queue_event_count
.add(other.queue_event_count.fetch_diff());
self.rate_limiter_event_count
8 changes: 8 additions & 0 deletions src/vmm/src/devices/virtio/block/virtio/mod.rs
Original file line number Diff line number Diff line change
@@ -63,4 +63,12 @@ pub enum VirtioBlockError {
RateLimiter(std::io::Error),
/// Persistence error: {0}
Persist(crate::devices::virtio::persist::PersistError),
/// Sector overflow in discard segment
SectorOverflow,
/// Discard segment exceeds disk size
BeyondDiskSize,
/// Invalid flags in discard segment
InvalidDiscardFlags,
/// Invalid discard request (e.g., empty segments)
InvalidDiscardRequest,
}
77 changes: 75 additions & 2 deletions src/vmm/src/devices/virtio/block/virtio/request.rs
Original file line number Diff line number Diff line change
@@ -9,17 +9,19 @@ use std::convert::From;

use vm_memory::GuestMemoryError;

use super::io::{BlockIoError, SyncIoError};
use super::{SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io};
use crate::devices::virtio::block::virtio::device::DiskProperties;
use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics;
pub use crate::devices::virtio::generated::virtio_blk::{
VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP,
VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT,
VIRTIO_BLK_T_DISCARD
};
use crate::devices::virtio::queue::DescriptorChain;
use crate::logger::{IncMetric, error};
use crate::rate_limiter::{RateLimiter, TokenType};
use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap};
use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap, Address};

#[derive(Debug, derive_more::From)]
pub enum IoErr {
@@ -34,6 +36,7 @@ pub enum RequestType {
Out,
Flush,
GetDeviceID,
Discard,
Unsupported(u32),
}

@@ -44,6 +47,7 @@ impl From<u32> for RequestType {
VIRTIO_BLK_T_OUT => RequestType::Out,
VIRTIO_BLK_T_FLUSH => RequestType::Flush,
VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID,
VIRTIO_BLK_T_DISCARD => RequestType::Discard,
t => RequestType::Unsupported(t),
}
}
@@ -176,6 +180,12 @@ impl PendingRequest {
(Ok(transferred_data_len), RequestType::GetDeviceID) => {
Status::from_data(self.data_len, transferred_data_len, true)
}
(Ok(_), RequestType::Discard) => {
block_metrics.discard_count.inc();
Status::Ok {
num_bytes_to_mem: 0,
}
}
(_, RequestType::Unsupported(op)) => Status::Unsupported { op },
(Err(err), _) => Status::IoErr {
num_bytes_to_mem: 0,
@@ -231,13 +241,24 @@ impl RequestHeader {
}
}

/// A single discard segment as laid out by the guest driver in the request's
/// data descriptor. Layout follows the VirtIO spec v1.2 (`struct
/// virtio_blk_discard_write_zeroes`).
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(C)]
pub struct DiscardSegment {
    /// First sector (512-byte units) of the range to discard.
    sector: u64,
    /// Length of the range in sectors.
    num_sectors: u32,
    /// Bit 0 is "unmap"; all other bits are reserved per the spec.
    flags: u32,
}
Comment on lines +247 to +251
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

flags is not just a 32 bit integer, it is a struct with 2 u32 inside. Also it needs to have a C struct layout attribute.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for delay, was busy with other things but- I thought that flags was a single 32-bit integer, where 1 bit was for unmap, and the other 31 bits are reserved for struct alignment I'm assuming.

For reference, I'm using VirtI/O 1.2 Documentation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, you are right, I totally missed the bitfield syntax in C definition.
Still you need to add the [repr(C)] to the struct definition.

// SAFETY: DiscardSegment is #[repr(C)] and contains only plain integer
// fields (u64 + u32 + u32 = 16 bytes, no padding), so every bit pattern is
// a valid value and it can be safely read from guest memory bytes.
unsafe impl ByteValued for DiscardSegment {}

#[derive(Debug, PartialEq, Eq)]
pub struct Request {
pub r#type: RequestType,
pub data_len: u32,
pub status_addr: GuestAddress,
sector: u64,
data_addr: GuestAddress,
discard_segments: Option<Vec<DiscardSegment>>,
}

impl Request {
@@ -258,10 +279,11 @@ impl Request {
data_addr: GuestAddress(0),
data_len: 0,
status_addr: GuestAddress(0),
discard_segments: None
};

let data_desc;
let status_desc;
let mut status_desc;
let desc = avail_desc
.next_descriptor()
.ok_or(VirtioBlockError::DescriptorChainTooShort)?;
@@ -272,6 +294,7 @@ impl Request {
if req.r#type != RequestType::Flush {
return Err(VirtioBlockError::DescriptorChainTooShort);
}
data_desc = desc;
} else {
data_desc = desc;
status_desc = data_desc
@@ -287,6 +310,9 @@ impl Request {
if !data_desc.is_write_only() && req.r#type == RequestType::GetDeviceID {
return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor);
}
if data_desc.is_write_only() && req.r#type == RequestType::Discard {
return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor);
}

req.data_addr = data_desc.addr;
req.data_len = data_desc.len;
@@ -313,6 +339,45 @@ impl Request {
return Err(VirtioBlockError::InvalidDataLength);
}
}
RequestType::Discard => {
// Validate data length
let segment_size = std::mem::size_of::<DiscardSegment>() as u32;
if data_desc.len % segment_size != 0 {
return Err(VirtioBlockError::InvalidDataLength);
}

// Calculate number of segments
let num_segments = data_desc.len / segment_size;
if num_segments == 0 {
return Err(VirtioBlockError::InvalidDiscardRequest);
}
let mut segments = Vec::with_capacity(num_segments as usize);
Copy link
Contributor

@ShadowCurse ShadowCurse Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can avoid having to deal with multiple segments if we just limit their number to 1 in the config space of the device. You will need to expand the ConfigSpace type to include max_discard_sectors and everything before it. Also in all other code please avoid dynamic allocations, it can cause issues in the future.


// Populate DiscardSegments vector
for i in 0..num_segments {
let offset = i * segment_size;
let segment: DiscardSegment = mem.read_obj(data_desc.addr.unchecked_add(offset as u64))
.map_err(VirtioBlockError::GuestMemory)?;
if segment.flags & !0x1 != 0 {
return Err(VirtioBlockError::InvalidDiscardFlags);
}
let end_sector = segment.sector.checked_add(segment.num_sectors as u64)
.ok_or(VirtioBlockError::SectorOverflow)?;
if end_sector > num_disk_sectors {
return Err(VirtioBlockError::BeyondDiskSize);
}
segments.push(segment);
}

// Assign to req.discard_segments
req.discard_segments = Some(segments);
req.data_len = data_desc.len;
status_desc = data_desc.next_descriptor()
.ok_or(VirtioBlockError::DescriptorChainTooShort)?;
if !status_desc.is_write_only() {
return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor);
}
}
_ => {}
}

@@ -390,6 +455,11 @@ impl Request {
.map_err(IoErr::GetId);
return ProcessingResult::Executed(pending.finish(mem, res, block_metrics));
}
RequestType::Discard => {
let _metric = block_metrics.write_agg.record_latency_metrics();
disk.file_engine
.discard(self.offset(), self.data_len, pending)
}
RequestType::Unsupported(_) => {
return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics));
}
@@ -730,6 +800,7 @@ mod tests {
RequestType::Out => VIRTIO_BLK_T_OUT,
RequestType::Flush => VIRTIO_BLK_T_FLUSH,
RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID,
RequestType::Discard => VIRTIO_BLK_T_DISCARD,
RequestType::Unsupported(id) => id,
}
}
@@ -742,6 +813,7 @@ mod tests {
RequestType::Out => VIRTQ_DESC_F_NEXT,
RequestType::Flush => VIRTQ_DESC_F_NEXT,
RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE,
RequestType::Discard => VIRTQ_DESC_F_NEXT,
RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT,
}
}
@@ -833,6 +905,7 @@ mod tests {
status_addr,
sector: sector & (NUM_DISK_SECTORS - sectors_len),
data_addr,
discard_segments: None
};
let mut request_header = RequestHeader::new(virtio_request_id, request.sector);

16 changes: 16 additions & 0 deletions src/vmm/src/io_uring/operation/mod.rs
Original file line number Diff line number Diff line change
@@ -29,6 +29,8 @@ pub enum OpCode {
Write = generated::IORING_OP_WRITE as u8,
/// Fsync operation.
Fsync = generated::IORING_OP_FSYNC as u8,
/// Fallocate operation.
Fallocate = generated::IORING_OP_FALLOCATE as u8,
}

// Useful for outputting errors.
@@ -38,6 +40,7 @@ impl From<OpCode> for &'static str {
OpCode::Read => "read",
OpCode::Write => "write",
OpCode::Fsync => "fsync",
OpCode::Fallocate => "fallocate",
}
}
}
@@ -113,6 +116,19 @@ impl<T: Debug> Operation<T> {
}
}

/// Construct a fallocate operation.
///
/// NOTE(review): io_uring's `IORING_OP_FALLOCATE` ABI places the fallocate
/// *mode* in `sqe->len` and the *length* in `sqe->addr` (see liburing's
/// `io_uring_prep_fallocate`). Here the length goes into `len` and no mode
/// is conveyed (`flags` is sqe->rw_flags, not the fallocate mode) — confirm
/// how the Sqe builder maps these fields, otherwise the kernel will see
/// mode 0 (allocate) instead of a punch-hole discard.
pub fn fallocate(fd: FixedFd, len: u32, offset: u64, user_data: T) -> Self {
    Self {
        fd,
        opcode: OpCode::Fallocate,
        // No guest-memory buffer is involved in fallocate.
        addr: None,
        len: Some(len),
        flags: 0,
        offset: Some(offset),
        user_data,
    }
}

pub(crate) fn fd(&self) -> FixedFd {
self.fd
}
38 changes: 38 additions & 0 deletions tests/integration_tests/functional/test_discard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
import os
import host_tools.drive as drive_tools


def test_discard_support(uvm_plain_rw):
    """
    Test the VIRTIO_BLK_T_DISCARD feature.

    Verifies that the guest sees a non-zero discard granularity on the
    virtio-block device and that fstrim successfully issues discard requests.
    """

    vm = uvm_plain_rw
    vm.spawn()
    vm.basic_config()
    vm.add_net_iface()

    # Attach a 128 MiB scratch disk so discards can be exercised beyond the
    # root device.
    fs1 = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "test_disk"), size=128)
    vm.add_drive(
        drive_id="test_disk",
        path_on_host=fs1.path,
        is_root_device=False,
        is_read_only=False,
    )

    vm.start()

    # The kernel exposes a non-zero discard_max_bytes only when the device
    # negotiated VIRTIO_BLK_F_DISCARD.
    exit_code, stdout, _ = vm.ssh.run("cat /sys/block/vda/queue/discard_max_bytes")
    assert exit_code == 0, "Failed to read discard_max_bytes"
    assert int(stdout.strip()) > 0, f"discard_max_bytes is 0: {stdout}"

    # Remount root with the discard option and assert it succeeded (the
    # result was previously ignored, masking remount failures).
    exit_code, stdout, _ = vm.ssh.run("mount -o remount,discard /")
    assert exit_code == 0, f"Failed to remount root filesystem with discard option: {stdout}"
    exit_code, stdout, _ = vm.ssh.run("mount | grep /dev/vda")
    assert exit_code == 0, f"Root device not found in mount output: {stdout}"

    # fstrim issues VIRTIO_BLK_T_DISCARD requests for all free blocks.
    exit_code, stdout, _ = vm.ssh.run("fstrim -v /")
    assert exit_code == 0, f"fstrim -v failed: {stdout}"
    assert "bytes trimmed" in stdout, f"Unexpected fstrim output: {stdout}"
    vm.kill()