From 7abcad198be841b2a0991bc7b047cfba50757c8e Mon Sep 17 00:00:00 2001 From: Gaelan Steele Date: Wed, 8 Mar 2023 16:08:15 +0100 Subject: [PATCH] scsi: Add virtio daemon This adds the virtio-specific parts that use the previously formed interfaces and scsi emulation in order to build a daemon that offers files from the host system as drives to the guest. The vast majority of this work was done by Gaelan Steele as part of a GSoC project [1][2]. [1] https://github.com/rust-vmm/vhost-device/pull/4 [2] https://gist.github.com/Gaelan/febec4e4606e1320026a0924c3bf74d0 Co-developed-by: Erik Schilling Signed-off-by: Erik Schilling Signed-off-by: Gaelan Steele --- crates/scsi/CHANGELOG.md | 3 + crates/scsi/src/lib.rs | 3 - crates/scsi/src/main.rs | 129 ++++++++++++++- crates/scsi/src/vhu_scsi.rs | 283 ++++++++++++++++++++++++++++++++ crates/scsi/src/virtio.rs | 313 ++++++++++++++++++++++++++++++++++++ 5 files changed, 726 insertions(+), 5 deletions(-) create mode 100644 crates/scsi/CHANGELOG.md delete mode 100644 crates/scsi/src/lib.rs create mode 100644 crates/scsi/src/vhu_scsi.rs create mode 100644 crates/scsi/src/virtio.rs diff --git a/crates/scsi/CHANGELOG.md b/crates/scsi/CHANGELOG.md new file mode 100644 index 0000000..d471959 --- /dev/null +++ b/crates/scsi/CHANGELOG.md @@ -0,0 +1,3 @@ +# Upcoming Release + +- Initial daemon implementation. 
diff --git a/crates/scsi/src/lib.rs b/crates/scsi/src/lib.rs deleted file mode 100644 index 25dcad9..0000000 --- a/crates/scsi/src/lib.rs +++ /dev/null @@ -1,3 +0,0 @@ -// We do not use any of this yet -#[allow(dead_code)] -mod scsi; diff --git a/crates/scsi/src/main.rs b/crates/scsi/src/main.rs index 5bf256e..bfb8ec2 100644 --- a/crates/scsi/src/main.rs +++ b/crates/scsi/src/main.rs @@ -1,3 +1,128 @@ -fn main() { - println!("Hello world"); +// SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause + +mod scsi; +mod vhu_scsi; +mod virtio; + +use std::{ + fs::File, + path::PathBuf, + sync::{Arc, RwLock}, +}; + +use clap::Parser; +use log::{error, info, warn}; +use thiserror::Error as ThisError; +use vhost::vhost_user::{self, Listener}; +use vhost_user_backend::VhostUserDaemon; +use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap}; + +use crate::scsi::emulation::{ + block_device::{BlockDevice, FileBackend, MediumRotationRate}, + target::EmulatedTarget, +}; +use crate::vhu_scsi::VhostUserScsiBackend; + +#[derive(Debug, ThisError)] +enum Error { + #[error("More than 256 LUNs aren't currently supported")] + TooManyLUNs, + #[error("Failed creating listener: {0}")] + FailedCreatingListener(vhost_user::Error), +} + +type Result = std::result::Result; + +#[derive(Parser)] +struct ScsiArgs { + /// Make the images read-only. + /// + /// Currently, we don't actually support writes, but sometimes we want to + /// pretend the disk is writable to work around issues with some tools that + /// use the Linux SCSI generic API. + #[arg(long = "read-only", short = 'r')] + read_only: bool, + /// Tell the guest this disk is non-rotational. + /// + /// Affects some heuristics in Linux around, for example, scheduling. + #[arg(long = "solid-state")] + solid_state: bool, + /// Location of vhost-user socket. + #[clap(short, long)] + socket_path: PathBuf, + /// Images against which the SCSI actions are emulated. 
+ images: Vec, +} + +fn create_backend(args: &ScsiArgs) -> Result { + let mut backend = VhostUserScsiBackend::new(); + let mut target = EmulatedTarget::new(); + + if args.images.len() > 256 { + // This is fairly simple to add; it's just a matter of supporting the right LUN + // encoding formats. + error!("Currently only up to 256 targets are supported"); + return Err(Error::TooManyLUNs); + } + + if !args.read_only { + warn!("Currently, only read-only images are supported. Unless you know what you're doing, you want to pass -r"); + } + + for image in &args.images { + let mut dev = BlockDevice::new(FileBackend::new(File::open(image).expect("Opening image"))); + dev.set_write_protected(args.read_only); + dev.set_solid_state(if args.solid_state { + MediumRotationRate::NonRotating + } else { + MediumRotationRate::Unreported + }); + target.add_lun(Box::new(dev)); + } + + backend.add_target(Box::new(target)); + Ok(backend) +} + +fn start_backend(backend: VhostUserScsiBackend, args: ScsiArgs) -> Result<()> { + let backend = Arc::new(RwLock::new(backend)); + let mut daemon = VhostUserDaemon::new( + "vhost-user-scsi".into(), + Arc::clone(&backend), + GuestMemoryAtomic::new(GuestMemoryMmap::new()), + ) + .expect("Creating daemon"); + + daemon + .start(Listener::new(args.socket_path, true).map_err(Error::FailedCreatingListener)?) + .expect("Starting daemon"); + + match daemon.wait() { + Ok(()) => { + info!("Stopping cleanly."); + } + Err(vhost_user_backend::Error::HandleRequest(vhost_user::Error::PartialMessage)) => { + info!("vhost-user connection closed with partial message. If the VM is shutting down, this is expected behavior; otherwise, it might be a bug."); + } + Err(e) => { + warn!("Error running daemon: {:?}", e); + } + } + + // No matter the result, we need to shut down the worker thread. 
+ // unwrap will only panic if we already panicked somewhere else + backend + .read() + .unwrap() + .exit_event + .write(1) + .expect("Shutting down worker thread"); + Ok(()) +} + +fn main() -> Result<()> { + env_logger::init(); + let args = ScsiArgs::parse(); + let backend = create_backend(&args)?; + start_backend(backend, args) } diff --git a/crates/scsi/src/vhu_scsi.rs b/crates/scsi/src/vhu_scsi.rs new file mode 100644 index 0000000..2cb12e4 --- /dev/null +++ b/crates/scsi/src/vhu_scsi.rs @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause + +use std::convert::TryFrom; +use std::io::{self, ErrorKind}; + +use log::{debug, error, info, warn}; +use vhost::vhost_user::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; +use vhost_user_backend::{VhostUserBackendMut, VringRwLock, VringT}; +use virtio_bindings::{ + virtio_config::VIRTIO_F_VERSION_1, + virtio_ring::{VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC}, + virtio_scsi::VIRTIO_SCSI_F_HOTPLUG, +}; +use virtio_queue::QueueOwnedT; +use vm_memory::{GuestAddressSpace, GuestMemoryAtomic, GuestMemoryLoadGuard, GuestMemoryMmap}; +use vmm_sys_util::{ + epoll::EventSet, + eventfd::{EventFd, EFD_NONBLOCK}, +}; + +use crate::scsi::Target; +use crate::{ + scsi::{self, CmdError, TaskAttr}, + virtio::{self, Request, RequestParseError, Response, ResponseCode, VirtioScsiLun, SENSE_SIZE}, +}; + +const REQUEST_QUEUE: u16 = 2; + +type DescriptorChainWriter = virtio::DescriptorChainWriter>; +type DescriptorChainReader = virtio::DescriptorChainReader>; + +pub(crate) struct VhostUserScsiBackend { + event_idx: bool, + mem: Option>, + targets: Vec>, + pub(crate) exit_event: EventFd, +} + +impl VhostUserScsiBackend { + pub(crate) fn new() -> Self { + Self { + event_idx: false, + mem: None, + targets: Vec::new(), + exit_event: EventFd::new(EFD_NONBLOCK).expect("Creating exit eventfd"), + } + } + + fn parse_target(&mut self, lun: VirtioScsiLun) -> Option<(&mut Box, u16)> { + match lun { + 
VirtioScsiLun::TargetLun(target, lun) => self + .targets + .get_mut(usize::from(target)) + .map(|tgt| (tgt, lun)), + VirtioScsiLun::ReportLuns => { + // TODO: do we need to handle the REPORT LUNS well-known LUN? + // In practice, everyone seems to just use LUN 0 + warn!("Guest is trying to use the REPORT LUNS well-known LUN, which we don't support."); + None + } + } + } + + fn process_requests( + &mut self, + reader: &mut DescriptorChainReader, + writer: &mut DescriptorChainWriter, + ) { + let mut body_writer = writer.clone(); + const RESPONSE_HEADER_SIZE: u32 = 12; + body_writer.skip( + RESPONSE_HEADER_SIZE + u32::try_from(SENSE_SIZE).expect("SENSE_SIZE should fit 32bit"), + ); + + let response = match Request::parse(reader) { + Ok(r) => { + if let Some((target, lun)) = self.parse_target(r.lun) { + let output = target.execute_command( + lun, + reader, + &mut body_writer, + scsi::Request { + id: r.id, + cdb: &r.cdb, + task_attr: match r.task_attr { + 0 => TaskAttr::Simple, + 1 => TaskAttr::Ordered, + 2 => TaskAttr::HeadOfQueue, + 3 => TaskAttr::Aca, + _ => { + // virtio-scsi spec allows us to map any task attr to simple, presumably + // including future ones + warn!("Unknown task attr: {}", r.task_attr); + TaskAttr::Simple + } + }, + crn: r.crn, + prio: r.prio, + }, + ); + + match output { + Ok(output) => { + assert!(output.sense.len() < SENSE_SIZE); + + Response { + response: ResponseCode::Ok, + status: output.status, + status_qualifier: output.status_qualifier, + sense: output.sense, + // TODO: handle residual for data in + residual: body_writer.residual(), + } + } + Err(CmdError::CdbTooShort) => { + // the CDB buffer is, by default, sized larger than any CDB we support; we don't + // handle writes to config space (because QEMU doesn't let us), so there's no + // way the guest can set it too small + unreachable!(); + } + Err(CmdError::DataIn(e)) => { + if e.kind() == ErrorKind::WriteZero { + Response::error(ResponseCode::Overrun, 0) + } else { + error!("Error 
writing response to guest memory: {}", e); + + // There's some chance the header and data in are on different descriptors, + // and only the data in descriptor is bad, so let's at least try to write an + // error to the header + Response::error(ResponseCode::Failure, body_writer.residual()) + } + } + } + } else { + debug!("Rejecting command to LUN with bad target {:?}", r.lun); + Response::error(ResponseCode::BadTarget, body_writer.residual()) + } + } + Err(RequestParseError::CouldNotReadGuestMemory(e)) => { + // See comment later about errors while writing to guest mem; maybe we at least + // got functional write descriptors, so we can report an error + error!("Error reading request from guest memory: {:?}", e); + Response::error(ResponseCode::Failure, body_writer.residual()) + } + Err(RequestParseError::FailedParsingLun(lun)) => { + error!("Unable to parse LUN: {:?}", lun); + Response::error(ResponseCode::Failure, body_writer.residual()) + } + }; + + if let Err(e) = response.write(writer) { + // Alright, so something went wrong writing our response header to guest memory. + // The only reason this should ever happen, I think, is if the guest gave us a + // virtio descriptor with an invalid address. + + // There's not a great way to recover from this - we just discovered that + // our only way of communicating with the guest doesn't work - so we either + // silently fail or crash. There isn't too much sense in crashing, IMO, as + // the guest could still recover by, say, installing a fixed kernel and + // rebooting. So let's just log an error and do nothing. + error!("Error writing response to guest memory: {:?}", e); + } + } + + fn process_request_queue(&mut self, vring: &VringRwLock) -> Result<(), io::Error> { + let chains: Vec<_> = vring + .get_mut() + .get_queue_mut() + .iter(self.mem.as_ref().unwrap().memory()) + .map_err(|e| io::Error::new(ErrorKind::Other, e))? 
+ .collect(); + for dc in chains { + let mut writer = DescriptorChainWriter::new(dc.clone()); + let mut reader = DescriptorChainReader::new(dc.clone()); + + self.process_requests(&mut reader, &mut writer); + + vring + .add_used(dc.head_index(), writer.max_written()) + .map_err(|e| io::Error::new(ErrorKind::Other, e))?; + } + + vring + .signal_used_queue() + .map_err(|e| io::Error::new(ErrorKind::Other, e))?; + Ok(()) + } + + pub(crate) fn add_target(&mut self, target: Box) { + self.targets.push(target); + } +} + +impl VhostUserBackendMut for VhostUserScsiBackend { + fn num_queues(&self) -> usize { + // control + event + request queues + let num_request_queues = 1; + 2 + num_request_queues + } + + fn max_queue_size(&self) -> usize { + 128 // qemu assumes this by default + } + + fn features(&self) -> u64 { + 1 << VIRTIO_F_VERSION_1 + | 1 << VIRTIO_SCSI_F_HOTPLUG + | 1 << VIRTIO_RING_F_INDIRECT_DESC + | 1 << VIRTIO_RING_F_EVENT_IDX + | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() + } + + fn protocol_features(&self) -> VhostUserProtocolFeatures { + VhostUserProtocolFeatures::MQ + } + + fn set_event_idx(&mut self, enabled: bool) { + self.event_idx = enabled; + } + + fn update_memory( + &mut self, + atomic_mem: GuestMemoryAtomic, + ) -> std::result::Result<(), std::io::Error> { + info!("Memory updated - guest probably booting"); + self.mem = Some(atomic_mem); + Ok(()) + } + + fn handle_event( + &mut self, + device_event: u16, + evset: EventSet, + vrings: &[VringRwLock], + thread_id: usize, + ) -> io::Result { + assert!(evset == EventSet::IN); + assert!(vrings.len() == 3); + assert!((device_event as usize) < vrings.len()); + assert!(thread_id == 0); + + let vring = &vrings[device_event as usize]; + match device_event { + REQUEST_QUEUE => { + if self.event_idx { + // vm-virtio's Queue implementation only checks avail_index + // once, so to properly support EVENT_IDX we need to keep + // calling process_request_queue() until it stops finding + // new requests on the 
queue. + loop { + vring.disable_notification().unwrap(); + self.process_request_queue(vring)?; + if !vring.enable_notification().unwrap() { + break; + } + } + } else { + // Without EVENT_IDX, a single call is enough. + self.process_request_queue(vring)?; + } + } + _ => { + error!("Ignoring descriptor on queue {}", device_event); + } + } + + Ok(false) + } + + fn get_config(&self, _offset: u32, _size: u32) -> Vec { + // QEMU handles config space itself + panic!("Access to configuration space is not supported."); + } + + fn set_config(&mut self, _offset: u32, _buf: &[u8]) -> std::result::Result<(), std::io::Error> { + // QEMU handles config space itself + panic!("Access to configuration space is not supported."); + } + + fn exit_event(&self, _thread_index: usize) -> Option { + Some(self.exit_event.try_clone().expect("Cloning exit eventfd")) + } +} diff --git a/crates/scsi/src/virtio.rs b/crates/scsi/src/virtio.rs new file mode 100644 index 0000000..2f2ecb3 --- /dev/null +++ b/crates/scsi/src/virtio.rs @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause + +//! Helpers for virtio and virtio-scsi. +use std::{ + cell::Cell, + cmp::{max, min}, + convert::TryInto, + io, + io::{ErrorKind, Read, Write}, + mem, + ops::Deref, + rc::Rc, +}; + +use log::error; +use virtio_bindings::virtio_scsi::virtio_scsi_cmd_req; +use virtio_queue::{Descriptor, DescriptorChain, DescriptorChainRwIter}; +use vm_memory::{Bytes, GuestAddress, GuestMemory}; + +/// virtio-scsi has its own format for LUNs, documented in 5.6.6.1 of virtio +/// v1.1. This represents a LUN parsed from that format. 
+#[derive(PartialEq, Eq, Clone, Copy, Debug)] +pub(crate) enum VirtioScsiLun { + ReportLuns, + TargetLun(u8, u16), +} + +pub(crate) const REPORT_LUNS: [u8; 8] = [0xc1, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0]; + +impl VirtioScsiLun { + pub(crate) const FLAT_SPACE_ADDRESSING_METHOD: u8 = 0b0100_0000; + pub(crate) const ADDRESS_METHOD_PATTERN: u8 = 0b1100_0000; + + pub(crate) fn parse(bytes: [u8; 8]) -> Option { + if bytes == REPORT_LUNS { + Some(Self::ReportLuns) + } else if bytes[0] == 0x1 { + let target = bytes[1]; + // bytes[2..3] is a normal SCSI single-level lun + if (bytes[2] & Self::ADDRESS_METHOD_PATTERN) != Self::FLAT_SPACE_ADDRESSING_METHOD { + error!( + "Got LUN in unsupported format: {:#2x} {:#2x}. \ + Only flat space addressing is supported!", + bytes[2], bytes[3] + ); + return None; + } + + let lun = u16::from_be_bytes([bytes[2] & !Self::ADDRESS_METHOD_PATTERN, bytes[3]]); + Some(Self::TargetLun(target, lun)) + } else { + None + } + } +} + +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ResponseCode { + Ok = 0, + Overrun = 1, + BadTarget = 3, + Failure = 9, +} + +// These are the defaults given in the virtio spec; QEMU doesn't let the driver +// write to config space, so these will always be the correct values. 
+pub(crate) const SENSE_SIZE: usize = 96; +pub(crate) const CDB_SIZE: usize = 32; + +pub(crate) struct Request { + pub id: u64, + pub lun: VirtioScsiLun, + pub prio: u8, + pub crn: u8, + pub cdb: [u8; CDB_SIZE], + pub task_attr: u8, +} + +#[derive(Debug)] +pub(crate) enum RequestParseError { + CouldNotReadGuestMemory(io::Error), + FailedParsingLun([u8; 8]), +} + +impl Request { + pub fn parse(reader: &mut impl Read) -> Result { + let mut request = [0; mem::size_of::()]; + + reader + .read_exact(&mut request) + .map_err(RequestParseError::CouldNotReadGuestMemory)?; + + let lun = VirtioScsiLun::parse(request[0..8].try_into().expect("slice is of length 8")) + .ok_or(RequestParseError::FailedParsingLun( + request[0..8].try_into().expect("slice to be of length 8"), + ))?; + + Ok(Self { + id: u64::from_le_bytes(request[8..16].try_into().expect("slice is of length 8")), + lun, + task_attr: request[16], + prio: request[17], + crn: request[18], + cdb: request[19..].try_into().expect("should fit into cdb"), + }) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct Response { + pub response: ResponseCode, + pub status: u8, + pub status_qualifier: u16, + pub sense: Vec, + pub residual: u32, +} + +impl Response { + pub fn write(&self, writer: &mut impl Write) -> Result<(), io::Error> { + writer.write_all(&(self.sense.len() as u32).to_le_bytes())?; // sense_len + writer.write_all(&self.residual.to_le_bytes())?; // residual + writer.write_all(&self.status_qualifier.to_le_bytes())?; // status qual + writer.write_all(&[self.status])?; // status + writer.write_all(&[self.response as u8])?; // response + + writer.write_all(&self.sense[..])?; + + Ok(()) + } + + /// Shortcut to create a response for an error condition, where most fields + /// don't matter. 
+ pub fn error(code: ResponseCode, residual: u32) -> Self { + assert!(code != ResponseCode::Ok); + Self { + response: code, + status: 0, + status_qualifier: 0, + sense: Vec::new(), + residual, + } + } +} + +// TODO: Drop this if https://github.com/rust-vmm/vm-virtio/pull/33 found an agreement +/// A `Write` implementation that writes to the memory indicated by a virtio +/// descriptor chain. +#[derive(Clone)] +pub struct DescriptorChainWriter +where + M::Target: GuestMemory, +{ + chain: DescriptorChain, + iter: DescriptorChainRwIter, + current: Option, + offset: u32, + written: u32, + max_written: Rc>, +} + +impl DescriptorChainWriter +where + M::Target: GuestMemory, +{ + pub fn new(chain: DescriptorChain) -> Self { + let mut iter = chain.clone().writable(); + let current = iter.next(); + Self { + chain, + iter, + current, + offset: 0, + written: 0, + max_written: Rc::new(Cell::new(0)), + } + } + + pub fn skip(&mut self, bytes: u32) { + self.offset += bytes; + self.add_written(bytes); + while self + .current + .map_or(false, |current| self.offset >= current.len()) + { + let current = self.current.expect("loop condition ensures existance"); + self.offset -= current.len(); + self.current = self.iter.next(); + } + } + + pub fn residual(&mut self) -> u32 { + let mut ret = 0; + while let Some(current) = self.current { + ret += current.len() - self.offset; + self.offset = 0; + self.current = self.iter.next(); + } + ret + } + + fn add_written(&mut self, written: u32) { + self.written += written; + self.max_written + .set(max(self.max_written.get(), self.written)); + } + + pub fn max_written(&self) -> u32 { + self.max_written.get() + } +} + +impl Write for DescriptorChainWriter +where + M::Target: GuestMemory, +{ + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if let Some(current) = self.current { + let left_in_descriptor = current.len() - self.offset; + let to_write: u32 = min(left_in_descriptor as usize, buf.len()) as u32; + + let written = self + .chain + 
.memory() + .write( + &buf[..(to_write as usize)], + GuestAddress(current.addr().0.checked_add(u64::from(self.offset)).ok_or( + io::Error::new(ErrorKind::Other, vm_memory::Error::InvalidGuestRegion), + )?), + ) + .map_err(|e| io::Error::new(ErrorKind::Other, e))?; + + self.offset += written as u32; + + if self.offset == current.len() { + self.current = self.iter.next(); + self.offset = 0; + } + + self.add_written(written as u32); + + Ok(written) + } else { + Ok(0) + } + } + + fn flush(&mut self) -> std::io::Result<()> { + // no-op: we're writing directly to guest memory + Ok(()) + } +} + +/// A `Read` implementation that reads from the memory indicated by a virtio +/// descriptor chain. +pub struct DescriptorChainReader +where + M::Target: GuestMemory, +{ + chain: DescriptorChain, + iter: DescriptorChainRwIter, + current: Option, + offset: u32, +} + +impl DescriptorChainReader +where + M::Target: GuestMemory, +{ + pub fn new(chain: DescriptorChain) -> Self { + let mut iter = chain.clone().readable(); + let current = iter.next(); + + Self { + chain, + iter, + current, + offset: 0, + } + } +} + +impl Read for DescriptorChainReader +where + M::Target: GuestMemory, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if let Some(current) = self.current { + let left_in_descriptor = current.len() - self.offset; + let to_read = min(left_in_descriptor, buf.len() as u32); + + let read = self + .chain + .memory() + .read( + &mut buf[..(to_read as usize)], + GuestAddress(current.addr().0 + u64::from(self.offset)), + ) + .map_err(|e| io::Error::new(ErrorKind::Other, e))?; + + self.offset += read as u32; + + if self.offset == current.len() { + self.current = self.iter.next(); + self.offset = 0; + } + + Ok(read) + } else { + Ok(0) + } + } +}