#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass report
\use_default_options false
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation skip
\defskip medskip
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes true
\output_changes true
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
Virtio PCI Card Specification
\begin_inset Newline newline
\end_inset
v0.9.5 DRAFT
\begin_inset Newline newline
\end_inset
-
\end_layout
\begin_layout Author
Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
\end_layout
\begin_layout Date
2012 May 7.
\end_layout
\begin_layout Chapter
Purpose and Description
\end_layout
\begin_layout Standard
This document describes the specifications of the
\begin_inset Quotes eld
\end_inset
virtio
\begin_inset Quotes erd
\end_inset
family of
\emph on
PCI
\emph default
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "PCI"
description "Peripheral Component Interconnect; a common device bus. See\\\\http://en.wikipedia.org/wiki/Peripheral Component Interconnect"
\end_inset
devices.
These devices are found in
\emph on
virtual
\emph default
\emph on
environments
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "virtualized"
description "Environments where access to hardware is restricted (and often emulated) by a hypervisor."
\end_inset
\emph default
, yet by design they are not all that different from physical PCI devices,
and this document treats them as such.
This allows the guest to use standard PCI drivers and discovery mechanisms.
\end_layout
\begin_layout Standard
The purpose of virtio and this specification is that virtual environments
and guests should have a straightforward, efficient, standard and extensible
mechanism for virtual devices, rather than boutique per-environment or
per-OS mechanisms.
\end_layout
\begin_layout Description
Straightforward: Virtio PCI devices use normal PCI mechanisms of interrupts
and DMA which should be familiar to any device driver author.
There is no exotic page-flipping or COW mechanism: it's just a PCI device.
\begin_inset Foot
status open
\begin_layout Plain Layout
This lack of page-sharing implies that the implementation of the device
(e.g.
the hypervisor or host) needs full access to the guest memory.
Communication with untrusted parties (i.e.
inter-guest communication) requires copying.
\end_layout
\end_inset
\end_layout
\begin_layout Description
Efficient: Virtio PCI devices consist of rings of descriptors for input
and output, which are neatly separated to avoid cache effects from both
guest and device writing to the same cache lines.
\end_layout
\begin_layout Description
Standard: Virtio PCI makes no assumptions about the environment in which
it operates, beyond supporting PCI.
In fact the virtio devices specified in the appendices do not require PCI
at all: they have been implemented on non-PCI buses.
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux implementation further separates the PCI virtio code from the
specific virtio drivers: these drivers are shared with the non-PCI implementati
ons (currently lguest and S/390).
\end_layout
\end_inset
\end_layout
\begin_layout Description
Extensible: Virtio PCI devices contain feature bits which are acknowledged
by the guest operating system during device setup.
This allows forwards and backwards compatibility: the device offers all
the features it knows about, and the driver acknowledges those it understands
and wishes to use.
\end_layout
\begin_layout Section
Virtqueues
\end_layout
\begin_layout Standard
The mechanism for bulk data transport on virtio PCI devices is pretentiously
called a virtqueue.
Each device can have zero or more virtqueues: for example, the network
device has one for transmit and one for receive.
\end_layout
\begin_layout Standard
Each virtqueue occupies two or more physically-contiguous pages (defined,
for the purposes of this specification, as 4096 bytes), and consists of
three parts:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="1" columns="4">
<features tabularvalignment="middle">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Descriptor Table
\end_layout
\end_inset
</cell>
<cell multicolumn="1" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Available Ring
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\emph on
(padding)
\end_layout
\end_inset
</cell>
<cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Used Ring
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
When the driver wants to send a buffer to the device, it fills in a slot
in the descriptor table (or chains several together), and writes the descriptor
index into the available ring.
It then notifies the device.
When the device has finished a buffer, it writes the descriptor into the
used ring, and sends an interrupt.
\end_layout
\begin_layout Chapter
Specification
\end_layout
\begin_layout Section
PCI Discovery
\end_layout
\begin_layout Standard
Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 through 0x103F
inclusive is a virtio device
\begin_inset Foot
status open
\begin_layout Plain Layout
The actual value within this range is ignored
\end_layout
\end_inset
.
The device must also have a Revision ID of 0 to match this specification.
\end_layout
\begin_layout Standard
The Subsystem Device ID indicates which virtio device is supported by the
device.
The Subsystem Vendor ID should reflect the PCI Vendor ID of the environment
(it's currently only used for informational purposes by the guest).
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="11" columns="3">
<features tabularvalignment="bottom">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="bottom" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Subsystem Device ID
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Virtio Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Specification
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
network card
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix C
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
block device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix D
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
3
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
console
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix E
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
4
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
entropy source
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix F
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
5
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
memory ballooning
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix G
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
6
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
ioMemory
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
-
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
7
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
rpmsg
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix H
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
SCSI host
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix I
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
9
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
9P transport
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
-
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
10
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
mac80211 wlan
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
-
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Section
Device Configuration
\end_layout
\begin_layout Standard
To configure the device, we use the first I/O region of the PCI device.
This contains a
\emph on
virtio header
\emph default
followed by a
\emph on
device-specific region.
\end_layout
\begin_layout Standard
There may be different widths of accesses to the I/O region; the
\begin_inset Quotes eld
\end_inset
natural
\begin_inset Quotes erd
\end_inset
access method for each field in the virtio header must be used (i.e.
32-bit accesses for 32-bit fields, etc), but the device-specific region
can be accessed using any width accesses, and should obtain the same results.
\end_layout
\begin_layout Standard
Note that this is possible because while the virtio header is PCI (i.e.
little) endian, the device-specific region is encoded in the native endian
of the guest (where such distinction is applicable).
\end_layout
\begin_layout Subsection
Device Initialization Sequence
\begin_inset CommandInset label
LatexCommand label
name "sub:Device-Initialization-Sequence"
\end_inset
\end_layout
\begin_layout Standard
We start with an overview of device initialization, then expand on the details
of the device and how each step is performed.
\end_layout
\begin_layout Enumerate
Reset the device.
This is not required on initial start up.
\end_layout
\begin_layout Enumerate
The ACKNOWLEDGE status bit is set: we have noticed the device.
\end_layout
\begin_layout Enumerate
The DRIVER status bit is set: we know how to drive the device.
\end_layout
\begin_layout Enumerate
Device-specific setup, including reading the Device Feature Bits, discovery
of virtqueues for the device, optional MSI-X setup, and reading and possibly
writing the virtio configuration space.
\end_layout
\begin_layout Enumerate
The subset of Device Feature Bits understood by the driver is written to
the device.
\end_layout
\begin_layout Enumerate
The DRIVER_OK status bit is set.
\end_layout
\begin_layout Enumerate
The device can now be used (i.e.
buffers added to the virtqueues)
\begin_inset Foot
status open
\begin_layout Plain Layout
Historically, drivers have used the device before steps 5 and 6.
This is only allowed if the driver does not use any features which would
alter this early use of the device.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
If any of these steps go irrecoverably wrong, the guest should set the FAILED
status bit to indicate that it has given up on the device (it can reset
the device later to restart if desired).
\end_layout
\begin_layout Standard
We now cover the fields required for general setup in detail.
\end_layout
\begin_layout Subsection
Virtio Header
\end_layout
\begin_layout Standard
The virtio header looks as follows:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="9">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Guest
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
ISR
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features bits 0:31
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features bits 0:31
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Address
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Size
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Select
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Notify
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
If MSI-X is enabled for the device, two additional fields immediately follow
this header:
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
i.e.
once you enable MSI-X on the device, the other fields move.
If you turn it off again, they move back!
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Configuration
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
(MSI-X)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
Immediately following these general headers, there may be device-specific
headers:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="2">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device Specific...
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Subsubsection
Device Status
\end_layout
\begin_layout Standard
The Device Status field is updated by the guest to indicate its progress.
This provides a simple low-level diagnostic: it's most useful to imagine
the status bits hooked up to traffic lights on the console indicating the status of
each device.
\end_layout
\begin_layout Standard
The device can be reset by writing a 0 to this field, otherwise at least
one bit should be set:
\end_layout
\begin_layout Description
ACKNOWLEDGE
\begin_inset space ~
\end_inset
(1) Indicates that the guest OS has found the device and recognized it as
a valid virtio device.
\end_layout
\begin_layout Description
DRIVER
\begin_inset space ~
\end_inset
(2) Indicates that the guest OS knows how to drive the device.
Under Linux, drivers can be loadable modules so there may be a significant
(or infinite) delay before setting this bit.
\end_layout
\begin_layout Description
DRIVER_OK
\begin_inset space ~
\end_inset
(4) Indicates that the driver is set up and ready to drive the device.
\end_layout
\begin_layout Description
FAILED
\begin_inset space ~
\end_inset
(128) Indicates that something went wrong in the guest, and it has given
up on the device.
This could be an internal error, or the driver didn't like the device for
some reason, or even a fatal error during device operation.
The device must be reset before attempting to re-initialize.
\end_layout
\begin_layout Subsubsection
Feature Bits
\begin_inset CommandInset label
LatexCommand label
name "sub:Feature-Bits"
\end_inset
\end_layout
\begin_layout Standard
The first configuration field indicates the features that the device supports.
The bits are allocated as follows:
\end_layout
\begin_layout Description
0
\begin_inset space ~
\end_inset
to
\begin_inset space ~
\end_inset
23 Feature bits for the specific device type
\end_layout
\begin_layout Description
24
\begin_inset space ~
\end_inset
to
\begin_inset space ~
\end_inset
32 Feature bits reserved for extensions to the queue and feature negotiation
mechanisms
\end_layout
\begin_layout Standard
For example, feature bit 0 for a network device (i.e.
Subsystem Device ID 1) indicates that the device supports checksumming
of packets.
\end_layout
\begin_layout Standard
The feature bits are
\emph on
negotiated:
\emph default
the device lists all the features it understands in the Device Features
field, and the guest writes the subset that it understands into the Guest
Features field.
The only way to renegotiate is to reset the device.
\end_layout
\begin_layout Standard
In particular, new fields in the device configuration header are indicated
by offering a feature bit, so the guest can check before accessing that
part of the configuration space.
\end_layout
\begin_layout Standard
This allows for forwards and backwards compatibility: if the device is enhanced
with a new feature bit, older guests will not write that feature bit back
to the Guest Features field and the device can go into backwards compatibility
mode.
Similarly, if a guest is enhanced with a feature that the device doesn't
support, it will not see that feature bit in the Device Features field
and can go into backwards compatibility mode (or, for poor implementations,
set the FAILED Device Status bit).
\end_layout
\begin_layout Subsubsection
Configuration/Queue Vectors
\end_layout
\begin_layout Standard
When MSI-X capability is present and enabled in the device (through standard
PCI configuration space) 4 bytes at byte offset 20 are used to map configuratio
n change and queue interrupts to MSI-X vectors.
In this case, the ISR Status field is unused, and device specific configuration
starts at byte offset 24 in virtio header structure.
When MSI-X capability is not enabled, device specific configuration starts
at byte offset 20 in virtio header.
\end_layout
\begin_layout Standard
Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of Configuration/Qu
eue Vector registers,
\emph on
maps
\emph default
interrupts triggered by the configuration change/selected queue events
respectively to the corresponding MSI-X vector.
To disable interrupts for a specific event type, unmap it by writing a
special NO_VECTOR value:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* Vector value used to disable MSI for queue */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_MSI_NO_VECTOR 0xffff
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Reading these registers returns the vector mapped to a given event, or NO_VECTOR
if unmapped.
All queue and configuration change events are unmapped by default.
\end_layout
\begin_layout Standard
Note that mapping an event to a vector might require allocating internal device
resources, and might fail.
Devices report such failures by returning the NO_VECTOR value when the
relevant Vector field is read.
After mapping an event to a vector, the driver must verify success by reading
the Vector field value: on success, the previously written value is returned,
and on failure, NO_VECTOR is returned.
If a mapping failure is detected, the driver can retry mapping with fewer vectors, or disable MSI-X.
\end_layout
\begin_layout Section
Virtqueue Configuration
\begin_inset CommandInset label
LatexCommand label
name "sec:Virtqueue-Configuration"
\end_inset
\end_layout
\begin_layout Standard
As a device can have zero or more virtqueues for bulk data transport (for
example, the network driver has two), the driver needs to configure them
as part of the device-specific configuration.
\end_layout
\begin_layout Standard
This is done as follows, for each virtqueue a device has:
\end_layout
\begin_layout Enumerate
Write the virtqueue index (first queue is 0) to the Queue Select field.
\end_layout
\begin_layout Enumerate
Read the virtqueue size from the Queue Size field, which is always a power
of 2.
This controls how big the virtqueue is (see below).
If this field is 0, the virtqueue does not exist.
\end_layout
\begin_layout Enumerate
Allocate and zero virtqueue in contiguous physical memory, on a 4096 byte
alignment.
Write the physical address, divided by 4096, to the Queue Address field.
\begin_inset Foot
status open
\begin_layout Plain Layout
The 4096 is based on the x86 page size, but it's also large enough to ensure
that the separate parts of the virtqueue are on separate cache lines.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
Optionally, if MSI-X capability is present and enabled on the device, select
a vector to use to request interrupts triggered by virtqueue events.
Write the MSI-X Table entry number corresponding to this vector in Queue
Vector field.
Read the Queue Vector field: on success, previously written value is returned;
on failure, NO_VECTOR value is returned.
\end_layout
\begin_layout Standard
The Queue Size field controls the total number of bytes required for the
virtqueue according to the following formula:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define ALIGN(x) (((x) + 4095) & ~4095)
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int qsz)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(3 + qsz))
\end_layout
\begin_layout Plain Layout
+ ALIGN(sizeof(u16)*3 + sizeof(struct vring_used_elem)*qsz);
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Standard
This currently wastes some space with padding, but also allows future extensions.
The virtqueue layout structure looks like this (qsz is the Queue Size field,
which is a variable, so this code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[qsz];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* A ring of available descriptor heads with free-running index.
*/
\end_layout
\begin_layout Plain Layout
struct vring_avail avail;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// Padding to the next 4096 boundary.
\end_layout
\begin_layout Plain Layout
char pad[];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
struct vring_used used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
A Note on Virtqueue Endianness
\end_layout
\begin_layout Standard
Note that the
\emph on
endian
\emph default
of these fields and everything else in the virtqueue is the native endian
of the guest, not little-endian as PCI normally is.
This makes for simpler guest code, and it is assumed that the host already
has to be deeply aware of the guest endian so such an
\begin_inset Quotes eld
\end_inset
endian-aware
\begin_inset Quotes erd
\end_inset
device is not a significant issue.
\end_layout
\begin_layout Subsection
Descriptor Table
\end_layout
\begin_layout Standard
The descriptor table refers to the buffers the guest is using for the device.
The addresses are physical addresses, and the buffers can be chained via
the next field.
Each descriptor describes a buffer which is read-only or write-only, but
a chain of descriptors can contain both read-only and write-only buffers.
\end_layout
\begin_layout Standard
No descriptor chain may be more than 2^32 bytes long in total.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
u64 addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
/* This means the buffer contains a list of buffer descriptors.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_INDIRECT 4
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
/* Next field if flags & NEXT */
\end_layout
\begin_layout Plain Layout
u16 next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The number of descriptors in the table is specified by the Queue Size field
for this virtqueue.
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Indirect-Descriptors"
\end_inset
Indirect Descriptors
\end_layout
\begin_layout Standard
Some devices benefit by concurrently dispatching a large number of large
requests.
The VIRTIO_RING_F_INDIRECT_DESC feature can be used to allow this (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
To increase ring capacity it is possible to store a table of
\emph on
indirect descriptors
\emph default
anywhere in memory, and insert a descriptor in main virtqueue (with flags&INDIR
ECT on) that refers to memory buffer containing this
\emph on
indirect descriptor table
\emph default
; fields
\emph on
addr
\emph default
and
\emph on
len
\emph default
refer to the indirect table address and length in bytes, respectively.
The indirect table layout structure looks like this (len is the length
of the descriptor that refers to this table, which is a variable, so this
code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct indirect_descriptor_table {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[len / 16];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The first indirect descriptor is located at start of the indirect descriptor
table (index 0), additional indirect descriptors are chained by next field.
An indirect descriptor without next field (with flags&NEXT off) signals
the end of the indirect descriptor table, and transfers control back to
the main virtqueue.
An indirect descriptor can not refer to another indirect descriptor table
(flags&INDIRECT must be off).
A single indirect descriptor table can include both read-only and write-only
descriptors; write-only flag (flags&WRITE) in the descriptor that refers
to it is ignored.
\end_layout
\begin_layout Subsection
Available Ring
\end_layout
\begin_layout Standard
The available ring refers to what descriptors we are offering the device:
it refers to the head of a descriptor chain.
The
\begin_inset Quotes eld
\end_inset
flags
\begin_inset Quotes erd
\end_inset
field is currently 0 or 1: 1 indicating that we do not need an interrupt
when the device consumes a descriptor from the available ring.
Alternatively, the guest can ask the device to delay interrupts until an
entry with an index specified by the
\begin_inset Quotes eld
\end_inset
used_event
\begin_inset Quotes erd
\end_inset
field is written in the used ring (equivalently, until the
\emph on
idx
\emph default
field in the used ring will reach the value
\emph on
used_event + 1
\emph default
).
The method employed by the device is controlled by the VIRTIO_RING_F_EVENT_IDX
feature bit (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
This interrupt suppression is merely an optimization; it may not suppress
interrupts entirely.
\end_layout
\begin_layout Standard
The
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field indicates where we would put the
\emph on
next
\emph default
descriptor entry (modulo the ring size).
This starts at 0, and increases.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
u16 ring[qsz]; /* qsz is the Queue Size field read from device */
\end_layout
\begin_layout Plain Layout
u16 used_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Used Ring
\end_layout
\begin_layout Standard
The used ring is where the device returns buffers once it is done with them.
The flags field can be used by the device to hint that no notification
is necessary when the guest adds to the
\emph on
available
\emph default
ring.
Alternatively, the
\begin_inset Quotes eld
\end_inset
avail_event
\begin_inset Quotes erd
\end_inset
field can be used by the device to hint that no notification is necessary
until an entry with an index specified by the
\begin_inset Quotes eld
\end_inset
avail_event
\begin_inset Quotes erd
\end_inset
is written in the available ring (equivalently, until the
\emph on
idx
\emph default
field in the available ring will reach the value
\emph on
avail_event + 1
\emph default
).
The method employed by the device is controlled by the guest through the
VIRTIO_RING_F_EVENT_IDX feature bit (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
\begin_inset Foot
status open
\begin_layout Plain Layout
These fields are kept here because this is the only part of the virtqueue
written by the device.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Each entry in the ring is a pair: the head entry of the descriptor chain
describing the buffer (this matches an entry placed in the available ring
by the guest earlier), and the total number of bytes written into the buffer.
The latter is extremely useful for guests using untrusted buffers: if you
do not know exactly how much has been written by the device, you usually
have to zero the buffer to ensure no data leakage occurs.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
u32 id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was used (written to)
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[qsz];
\end_layout
\begin_layout Plain Layout
u16 avail_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Helpers for Managing Virtqueues
\end_layout
\begin_layout Standard
The Linux Kernel Source code contains the definitions above and helper routines
in a more usable form, in include/linux/virtio_ring.h.
This was explicitly licensed by IBM and Red Hat under the (3-clause) BSD
license so that it can be freely used by all other projects, and is reproduced
(with slight variation to remove Linux assumptions) in Appendix A.
\end_layout
\begin_layout Section
Device Operation
\begin_inset CommandInset label
LatexCommand label
name "sec:Device-Operation"
\end_inset
\end_layout
\begin_layout Standard
There are two parts to device operation: supplying new buffers to the device,
and processing used buffers from the device.
As an example, the virtio network device has two virtqueues: the transmit
virtqueue and the receive virtqueue.
The driver adds outgoing (read-only) packets to the transmit virtqueue,
and then frees them after they are used.
Similarly, incoming (write-only) buffers are added to the receive virtqueue,
and processed after they are used.
\end_layout
\begin_layout Subsection
Supplying Buffers to The Device
\end_layout
\begin_layout Standard
Actual transfer of buffers from the guest OS to the device operates as follows:
\end_layout
\begin_layout Enumerate
Place the buffer(s) into free descriptor(s).
\end_layout
\begin_deeper
\begin_layout Enumerate
If there are no free descriptors, the guest may choose to notify the device
even if notifications are suppressed (to reduce latency).
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux drivers do this only for read-only buffers: for write-only buffers,
it is assumed that the driver is merely trying to keep the receive buffer
ring full, and no notification of this expected condition is necessary.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
Place the id of the buffer in the next ring entry of the available ring.
\end_layout
\begin_layout Enumerate
The steps (1) and (2) may be performed repeatedly if batching is possible.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure the device sees the updated
descriptor table and available ring before the next step.
\end_layout
\begin_layout Enumerate
The available
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field should be increased by the number of entries added to the available
ring.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure that we update the idx field
before checking for notification suppression.
\end_layout
\begin_layout Enumerate
If notifications are not suppressed, the device should be notified of the
new buffers.
\end_layout
\begin_layout Standard
Note that the above code does not take precautions against the available
ring buffer wrapping around: this is not possible since the ring buffer
is the same size as the descriptor table, so step (1) will prevent such
a condition.
\end_layout
\begin_layout Standard
In addition, the maximum queue size is 32768 (it must be a power of 2 which
fits in 16 bits), so the 16-bit
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
value can always distinguish between a full and empty buffer.
\end_layout
\begin_layout Standard
Here is a description of each stage in more detail.
\end_layout
\begin_layout Subsubsection
Placing Buffers Into The Descriptor Table
\end_layout
\begin_layout Standard
A buffer consists of zero or more read-only physically-contiguous elements
followed by zero or more physically-contiguous write-only elements (it
must have at least one element).
This algorithm maps it into the descriptor table:
\end_layout
\begin_layout Enumerate
for each buffer element,
\family typewriter
b
\family default
:
\end_layout
\begin_deeper
\begin_layout Enumerate
Get the next free descriptor table entry,
\family typewriter
d
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.addr
\family default
to the physical address of the start of
\family typewriter
b
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.len
\family default
to the length of
\family typewriter
b
\family default
.
\end_layout
\begin_layout Enumerate
If
\family typewriter
b
\family default
is write-only, set
\family typewriter
d.flags
\family default
to VRING_DESC_F_WRITE, otherwise 0.
\end_layout
\begin_layout Enumerate
If there is a buffer element after this:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set
\family typewriter
d.next
\family default
to the index of the next free descriptor element.
\end_layout
\begin_layout Enumerate
Set the VRING_DESC_F_NEXT bit in
\family typewriter
d.flags
\family default
.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
In practice, the d.next fields are usually used to chain free descriptors,
and a separate count kept to check there are enough free descriptors before
beginning the mappings.
\end_layout
\begin_layout Subsubsection
Updating The Available Ring
\end_layout
\begin_layout Standard
The head of the buffer we mapped is the first
\family typewriter
d
\family default
in the algorithm above.
A naive implementation would do the following:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[avail->idx % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Standard
However, in general we can add many descriptors before we update the
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field (at which point they become visible to the device), so we keep a
counter of how many we've added:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[(avail->idx + added++) % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
Updating The Index Field
\end_layout
\begin_layout Standard
Once the idx field of the virtqueue is updated, the device will be able
to access the descriptor entries we've created and the memory they refer
to.
This is why a memory barrier is generally used before the idx update, to
ensure the device sees the most up-to-date copy.
\end_layout
\begin_layout Standard
The idx field always increments, and we let it wrap naturally at 65536:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->idx += added;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Notifying-The-Device"
\end_inset
Notifying The Device
\end_layout
\begin_layout Standard
Device notification occurs by writing the 16-bit virtqueue index of this
virtqueue to the Queue Notify field of the virtio header in the first I/O
region of the PCI device.
This can be expensive, however, so the device can suppress such notifications
if it doesn't need them.
We have to be careful to expose the new idx value
\emph on
before
\emph default
checking the suppression flag: it's OK to notify gratuitously, but not
to omit a required notification.
So again, we use a memory barrier here before reading the flags or the
avail_event field.
\end_layout
\begin_layout Standard
If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated, and if the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write to the PCI configuration
space.
\end_layout
\begin_layout Standard
If the VIRTIO_RING_F_EVENT_IDX feature is negotiated, we read the avail_event
field in the used ring structure.
If the available index crossed the
\emph on
avail_event
\emph default
field value since the last notification, we go ahead and write to the PCI
configuration space.
The
\emph on
avail_event
\emph default
field wraps naturally at 65536 as well:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Receiving-Used-Buffers"
\end_inset
Receiving Used Buffers From The Device
\end_layout
\begin_layout Standard
Once the device has used a buffer (read from or written to it, or parts
of both, depending on the nature of the virtqueue and the device), it sends
an interrupt, following an algorithm very similar to the algorithm used
for the driver to send the device a buffer:
\end_layout
\begin_layout Enumerate
Write the head descriptor number to the next field in the used ring.
\end_layout
\begin_layout Enumerate
Update the used ring idx.
\end_layout
\begin_layout Enumerate
Determine whether an interrupt is necessary:
\end_layout
\begin_deeper
\begin_layout Enumerate
If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated: check if the
VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail\SpecialChar \nobreakdash-
>flags
\end_layout
\begin_layout Enumerate
If the VIRTIO_RING_F_EVENT_IDX feature is negotiated: check whether the
used index crossed the
\emph on
used_event
\emph default
field value since the last update.
The
\emph on
used_event
\emph default
field wraps naturally at 65536 as well:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
If an interrupt is necessary:
\end_layout
\begin_deeper
\begin_layout Enumerate
If MSI-X capability is disabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set the lower bit of the ISR Status field for the device.
\end_layout
\begin_layout Enumerate
Send the appropriate PCI interrupt for the device.
\end_layout
\end_deeper
\begin_layout Enumerate
If MSI-X capability is enabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Request the appropriate MSI-X interrupt message for the device, Queue Vector
field sets the MSI-X Table entry number.
\end_layout
\begin_layout Enumerate
If Queue Vector field value is NO_VECTOR, no interrupt message is requested
for this event.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
The guest interrupt handler should:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: read the ISR Status field, which will reset
it to zero.
If the lower bit is zero, the interrupt was not for this device.
Otherwise, the guest driver should look through the used rings of each
virtqueue for the device, to see if any progress has been made by the device
which requires servicing.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: look through the used rings of each virtqueue
mapped to the specific MSI-X vector for the device, to see if any progress
has been made by the device which requires servicing.
\end_layout
\begin_layout Standard
For each ring, guest should then disable interrupts by writing VRING_AVAIL_F_NO_
INTERRUPT flag in avail structure, if required.
It can then process used ring entries, finally enabling interrupts by clearing
the VRING_AVAIL_F_NO_INTERRUPT flag or updating the used_event field in
the available structure.
The guest should then execute a memory barrier, and
then recheck the ring empty condition.
This is necessary to handle the case where, after the last check and before
enabling interrupts, an interrupt has been suppressed by the device:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
vring_disable_interrupts(vq);
\end_layout
\begin_layout Plain Layout
for (;;) {
\end_layout
\begin_layout Plain Layout
if (vq->last_seen_used == vring->used.idx) {
\end_layout
\begin_layout Plain Layout
vring_enable_interrupts(vq);
\end_layout
\begin_layout Plain Layout
mb();
\end_layout
\begin_layout Plain Layout
if (vq->last_seen_used == vring->used.idx)
\end_layout
\begin_layout Plain Layout
break;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
struct vring_used_elem *e = &vring->used.ring[vq->last_seen_used%vsz];
\end_layout
\begin_layout Plain Layout
process_buffer(e);
\end_layout
\begin_layout Plain Layout
vq->last_seen_used++;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Dealing With Configuration Changes
\begin_inset CommandInset label
LatexCommand label
name "sub:Dealing-With-Configuration"
\end_inset
\end_layout
\begin_layout Standard
Some virtio PCI devices can change the device configuration state, as reflected
in the virtio header in the PCI configuration space.
In this case:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: an interrupt is delivered and the second
lowest bit is set in the ISR Status field to indicate that the driver
should re-examine the configuration space. Note that a single interrupt can
indicate both that one or more virtqueue has been used and that the configurati
on space has changed: even if the config bit is set, virtqueues must be
scanned.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: an interrupt message is requested.
The Configuration Vector field sets the MSI-X Table entry number to use.
If Configuration Vector field value is NO_VECTOR, no interrupt message
is requested for this event.
\end_layout
\begin_layout Chapter
Creating New Device Types
\end_layout
\begin_layout Standard
Various considerations are necessary when creating a new device type:
\end_layout
\begin_layout Section*
How Many Virtqueues?
\end_layout
\begin_layout Standard
It is possible that a very simple device will operate entirely through its
configuration space, but most will need at least one virtqueue in which
it will place requests.
A device with both input and output (eg.
console and network devices described here) needs two queues: one which
the driver fills with buffers to receive input, and one into which the driver
places buffers to transmit output.
\end_layout
\begin_layout Section*
What Configuration Space Layout?
\end_layout
\begin_layout Standard
Configuration space is generally used for rarely-changing or initialization-time
parameters.
But it is a limited resource, so it might be better to use a virtqueue
to update configuration information (the network device does this for filtering
, otherwise the table in the config space could potentially be very large).
\end_layout
\begin_layout Standard
Note that this space is generally the guest's native endian, rather than
PCI's little-endian.
\end_layout
\begin_layout Section*
What Device Number?
\end_layout
\begin_layout Standard
Currently device numbers are assigned quite freely: a simple request mail
to the author of this document or the Linux virtualization mailing list
\begin_inset Foot
status open
\begin_layout Plain Layout
https://lists.linux-foundation.org/mailman/listinfo/virtualization
\end_layout
\end_inset
will be sufficient to secure a unique one.
\end_layout
\begin_layout Standard
Meanwhile for experimental drivers, use 65535 and work backwards.
\end_layout
\begin_layout Section*
How many MSI-X vectors?
\end_layout
\begin_layout Standard
Using the optional MSI-X capability, devices can speed up interrupt processing
by removing the need to read ISR Status register by guest driver (which
might be an expensive operation), reducing interrupt sharing between devices
and queues within the device, and handling interrupts from multiple CPUs.
However, some systems impose a limit (which might be as low as 256) on
the total number of MSI-X vectors that can be allocated to all devices.
Devices and/or device drivers should take this into account, limiting the
number of vectors used unless the device is expected to cause a high volume
of interrupts.
Devices can control the number of vectors used by limiting the MSI-X Table
Size or not presenting MSI-X capability in PCI configuration space.
Drivers can control this by mapping events to as small a number of vectors
as possible, or disabling MSI-X capability altogether.
\end_layout
\begin_layout Section*
Message Framing
\end_layout
\begin_layout Standard
The descriptors used for a buffer should not affect the semantics of the
message, except for the total length of the buffer.
For example, a network buffer consists of a 10 byte header followed by
the network packet.
Whether this is presented in the ring descriptor chain as (say) a 10 byte
buffer and a 1514 byte buffer, or a single 1524 byte buffer, or even three
buffers, should have no effect.
\end_layout
\begin_layout Standard
In particular, no implementation should use the descriptor boundaries to
determine the size of any header in a request.
\begin_inset Foot
status open
\begin_layout Plain Layout
The current qemu device implementations mistakenly insist that the first
descriptor cover the header in these cases exactly, so a cautious driver
should arrange it so.
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Improvements
\end_layout
\begin_layout Standard
Any change to configuration space, or new virtqueues, or behavioural changes,
should be indicated by negotiation of a new feature bit.
This establishes clarity
\begin_inset Foot
status open
\begin_layout Plain Layout
Even if it does mean documenting design or implementation mistakes!
\end_layout
\end_inset
and avoids future expansion problems.
\end_layout
\begin_layout Standard
Clusters of functionality which are always implemented together can use
a single bit, but if one feature makes sense without the others they should
not be gratuitously grouped together to conserve feature bits.
We can always extend the spec when the first person needs more than 24
feature bits for their device.
\end_layout
\begin_layout Standard
\begin_inset CommandInset nomencl_print
LatexCommand printnomenclature
set_width "none"
\end_inset
\end_layout
\begin_layout Chapter*
Appendix A: virtio_ring.h
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#ifndef VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
#define VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
/* An interface for efficient virtio implementation.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* This header is BSD licensed so anyone can use the definitions
\end_layout
\begin_layout Plain Layout
* to implement compatible drivers/servers.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Copyright 2007, 2009, IBM Corporation
\end_layout
\begin_layout Plain Layout
* Copyright 2011, Red Hat, Inc
\end_layout
\begin_layout Plain Layout
* All rights reserved.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Redistribution and use in source and binary forms, with or without
\end_layout
\begin_layout Plain Layout
* modification, are permitted provided that the following conditions
\end_layout
\begin_layout Plain Layout
* are met:
\end_layout
\begin_layout Plain Layout
* 1.
Redistributions of source code must retain the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer.
\end_layout
\begin_layout Plain Layout
* 2.
Redistributions in binary form must reproduce the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer in the
\end_layout
\begin_layout Plain Layout
* documentation and/or other materials provided with the distribution.
\end_layout
\begin_layout Plain Layout
* 3.
Neither the name of IBM nor the names of its contributors
\end_layout
\begin_layout Plain Layout
* may be used to endorse or promote products derived from this software
\end_layout
\begin_layout Plain Layout
* without specific prior written permission.
\end_layout
\begin_layout Plain Layout
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
IS'' AND
\end_layout
\begin_layout Plain Layout
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
\end_layout
\begin_layout Plain Layout
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
\end_layout
\begin_layout Plain Layout
* ARE DISCLAIMED.
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
\end_layout
\begin_layout Plain Layout
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
\end_layout
\begin_layout Plain Layout
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
\end_layout
\begin_layout Plain Layout
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
\end_layout
\begin_layout Plain Layout
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
\end_layout
\begin_layout Plain Layout
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY
\end_layout
\begin_layout Plain Layout
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
\end_layout
\begin_layout Plain Layout
* SUCH DAMAGE.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The Host uses this in used->flags to advise the Guest: don't kick me
\end_layout
\begin_layout Plain Layout
* when you add a buffer.
It's unreliable, so it's simply an
\end_layout
\begin_layout Plain Layout
* optimization.
Guest will still kick if it's out of buffers.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
/* The Guest uses this in avail->flags to advise the Host: don't
\end_layout
\begin_layout Plain Layout
* interrupt me when you consume a buffer.
It's unreliable, so it's
\end_layout
\begin_layout Plain Layout
* simply an optimization.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* Virtio ring descriptors: 16 bytes.
\end_layout
\begin_layout Plain Layout
* These can chain together via "next".
*/
\end_layout
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
uint64_t addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
/* We chain unused descriptors via this, too */
\end_layout
\begin_layout Plain Layout
uint16_t next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
uint16_t ring[];
\end_layout
\begin_layout Plain Layout
uint16_t used_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
uint32_t id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was written to.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[];
\end_layout
\begin_layout Plain Layout
uint16_t avail_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
unsigned int num;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_desc *desc;
\end_layout
\begin_layout Plain Layout
struct vring_avail *avail;
\end_layout
\begin_layout Plain Layout
struct vring_used *used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The standard layout for the ring is a continuous chunk of memory which
\end_layout
\begin_layout Plain Layout
* looks like this.
We assume num is a power of 2.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* struct vring {
\end_layout
\begin_layout Plain Layout
* // The actual descriptors (16 bytes each)
\end_layout
\begin_layout Plain Layout
* struct vring_desc desc[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of available descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 avail_flags;
\end_layout
\begin_layout Plain Layout
* __u16 avail_idx;
\end_layout
\begin_layout Plain Layout
* __u16 available[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // Padding to the next align boundary.
\end_layout
\begin_layout Plain Layout
* char pad[];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 used_flags;
\end_layout
\begin_layout Plain Layout
* __u16 used_idx;
\end_layout
\begin_layout Plain Layout
* struct vring_used_elem used[num];
\end_layout
\begin_layout Plain Layout
* };
\end_layout
\begin_layout Plain Layout
* Note: for virtio PCI, align is 4096.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
\end_layout
\begin_layout Plain Layout
unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
vr->num = num;
\end_layout
\begin_layout Plain Layout
vr->desc = p;
\end_layout
\begin_layout Plain Layout
vr->avail = p + num*sizeof(struct vring_desc);
\end_layout
\begin_layout Plain Layout
vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
\end_layout
\begin_layout Plain Layout
+ align-1)
\end_layout
\begin_layout Plain Layout
& ~(align - 1));
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int num, unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ((sizeof(struct vring_desc)*num + sizeof(uint16_t)*(2+num)
\end_layout
\begin_layout Plain Layout
+ align - 1) & ~(align - 1))
\end_layout
\begin_layout Plain Layout
+ sizeof(uint16_t)*3 + sizeof(struct vring_used_elem)*num;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
static inline int vring_need_event(uint16_t event_idx, uint16_t new_idx,
uint16_t old_idx)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx
- old_idx);
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
#endif /* VIRTIO_RING_H */
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
\begin_inset CommandInset label
LatexCommand label
name "cha:Reserved-Feature-Bits"
\end_inset
Appendix B: Reserved Feature Bits
\end_layout
\begin_layout Standard
Currently there are five device-independent feature bits defined:
\end_layout
\begin_layout Description
VIRTIO_F_NOTIFY_ON_EMPTY
\begin_inset space ~
\end_inset
(24) Negotiating this feature indicates that the driver wants an interrupt
if the device runs out of available descriptors on a virtqueue, even though
interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT flag or
the used_event field.
An example of this is the networking driver: it doesn't need to know every
time a packet is transmitted, but it does need to free the transmitted
packets a finite time after they are transmitted.
It can avoid using a timer if the device interrupts it when all the packets
are transmitted.
\end_layout
\begin_layout Description
VIRTIO_F_RING_INDIRECT_DESC
\begin_inset space ~
\end_inset
(28) Negotiating this feature indicates that the driver can use descriptors
with the VRING_DESC_F_INDIRECT flag set, as described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Indirect-Descriptors"
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_F_RING_EVENT_IDX
\begin_inset space ~
\end_inset
(29) This feature enables the
\emph on
used_event
\emph default
and the
\emph on
avail_event
\emph default
fields.
If set, it indicates that the device should ignore the
\emph on
flags
\emph default
field in the available ring structure.
Instead, the
\emph on
used_event
\emph default
field in this structure is used by the guest to suppress device interrupts.
Further, the driver should ignore the
\emph on
flags
\emph default
field in the used ring structure.
Instead, the
\emph on
avail_event
\emph default
field in this structure is used by the device to suppress notifications.
If unset, the driver should ignore the
\emph on
used_event
\emph default
field; the device should ignore the
\emph on
avail_event
\emph default
field; the
\emph on
flags
\emph default
field is used.
\end_layout
\begin_layout Chapter*
Appendix C: Network Device
\end_layout
\begin_layout Standard
The virtio network device is a virtual ethernet card, and is the most complex
of the devices supported so far by virtio.
It has been enhanced rapidly and demonstrates clearly how support for new features
should be added to an existing device.
Empty buffers are placed in one virtqueue for receiving packets, and outgoing
packets are enqueued into another for transmission in that order.
A third command queue is used to control advanced filtering features.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 1
\end_layout
\begin_layout Description
Virtqueues 0:receiveq.
1:transmitq.
2:controlq
\begin_inset Foot
status open
\begin_layout Plain Layout
Only if VIRTIO_NET_F_CTRL_VQ set
\end_layout
\end_inset
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_NET_F_CSUM
\begin_inset space ~
\end_inset
(0) Device handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_CSUM
\begin_inset space ~
\end_inset
(1) Guest handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_MAC
\begin_inset space ~
\end_inset
(5) Device has given MAC address.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GSO
\begin_inset space ~
\end_inset
(6) (Deprecated) device handles packets with any GSO type.
\begin_inset Foot
status open
\begin_layout Plain Layout
It was supposed to indicate segmentation offload support, but upon further
investigation it became clear that multiple bits were required.
\end_layout
\end_inset
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO4
\begin_inset space ~
\end_inset
(7) Guest can receive TSOv4.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO6
\begin_inset space ~
\end_inset
(8) Guest can receive TSOv6.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_ECN
\begin_inset space ~
\end_inset
(9) Guest can receive TSO with ECN.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_UFO
\begin_inset space ~
\end_inset
(10) Guest can receive UFO.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_TSO4
\begin_inset space ~
\end_inset
(11) Device can receive TSOv4.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_TSO6
\begin_inset space ~
\end_inset
(12) Device can receive TSOv6.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_ECN
\begin_inset space ~
\end_inset
(13) Device can receive TSO with ECN.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_UFO
\begin_inset space ~
\end_inset
(14) Device can receive UFO.
\end_layout
\begin_layout Description
VIRTIO_NET_F_MRG_RXBUF
\begin_inset space ~
\end_inset
(15) Guest can merge receive buffers.
\end_layout
\begin_layout Description
VIRTIO_NET_F_STATUS
\begin_inset space ~
\end_inset
(16) Configuration status field is available.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_VQ
\begin_inset space ~
\end_inset
(17) Control channel is available.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_RX
\begin_inset space ~
\end_inset
(18) Control channel RX mode support.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_VLAN
\begin_inset space ~
\end_inset
(19) Control channel VLAN filtering.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_ANNOUNCE
\begin_inset space ~
\end_inset
(21) Guest can send gratuitous packets.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout Two configuration fields are currently defined.
The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
Two read-only bits are currently defined for the status field: VIRTIO_NET_S_LIN
K_UP and VIRTIO_NET_S_ANNOUNCE.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_S_LINK_UP 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_S_ANNOUNCE 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_net_config {
\end_layout
\begin_layout Plain Layout
u8 mac[6];
\end_layout
\begin_layout Plain Layout
u16 status;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The initialization routine should identify the receive and transmission
virtqueues.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_MAC feature bit is set, the configuration space
\begin_inset Quotes eld
\end_inset
mac
\begin_inset Quotes erd
\end_inset
entry indicates the
\begin_inset Quotes eld
\end_inset
physical
\begin_inset Quotes erd
\end_inset
address of the network card, otherwise a private MAC address should
be assigned.
All guests are expected to negotiate this feature if it is set.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify the control
virtqueue.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link status can
be read from the bottom bit of the
\begin_inset Quotes eld
\end_inset
status
\begin_inset Quotes erd
\end_inset
config field.
Otherwise, the link should be assumed active.
\end_layout
\begin_layout Enumerate
The receive virtqueue should be filled with receive buffers.
This is described in detail below in
\begin_inset Quotes eld
\end_inset
Setting Up Receive Buffers
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Enumerate
A driver can indicate that it will generate checksumless packets by negotiating
the VIRTIO_NET_F_CSUM feature.
This
\begin_inset Quotes eld
\end_inset
checksum offload
\begin_inset Quotes erd
\end_inset
is a common feature on modern network cards.
\end_layout
\begin_layout Enumerate
If that feature is negotiated
\begin_inset Foot
status open
\begin_layout Plain Layout
ie.
VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are dependent on VIRTIO_NET_F_
CSUM; a device which offers the offload features must offer the checksum
feature, and a driver which accepts the offload features must accept the
checksum feature.
Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features depending
on VIRTIO_NET_F_GUEST_CSUM.
\end_layout
\end_inset
, a driver can use TCP or UDP segmentation offload by negotiating the VIRTIO_NET
_F_HOST_TSO4 (IPv4 TCP), VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST
_UFO (UDP fragmentation) features.
It should not send TCP packets requiring segmentation offload which have
the Explicit Congestion Notification bit set, unless the VIRTIO_NET_F_HOST_ECN
feature is negotiated.
\begin_inset Foot
status open
\begin_layout Plain Layout
This is a common restriction in real, older network cards.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
The converse features are also available: a driver can save the virtual
device some work by negotiating these features.
\begin_inset Foot
status open
\begin_layout Plain Layout
For example, a network packet transported between two guests on the same
system may not require checksumming at all, nor segmentation, if both guests
are amenable.
\end_layout
\end_inset
The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially checksummed
packets can be received, and if it can do that then the VIRTIO_NET_F_GUEST_TSO4
, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN
are the input equivalents of the features described above.
See
\begin_inset Quotes eld
\end_inset
Receiving Packets
\begin_inset Quotes erd
\end_inset
below.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
Packets are transmitted by placing them in the transmitq, and buffers for
incoming packets are placed in the receiveq.
In each case, the packet itself is preceded by a header:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_hdr {
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
\end_layout
\begin_layout Plain Layout
u8 flags;
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_NONE 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_TCPV4 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_UDP 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_TCPV6 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_ECN 0x80
\end_layout
\begin_layout Plain Layout
u8 gso_type;
\end_layout
\begin_layout Plain Layout
u16 hdr_len;
\end_layout
\begin_layout Plain Layout
u16 gso_size;
\end_layout
\begin_layout Plain Layout
u16 csum_start;
\end_layout
\begin_layout Plain Layout
u16 csum_offset;
\end_layout
\begin_layout Plain Layout
/* Only if VIRTIO_NET_F_MRG_RXBUF: */
\end_layout
\begin_layout Plain Layout
u16 num_buffers;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The controlq is used to control device features such as filtering.
\end_layout
\begin_layout Subsection*
Packet Transmission
\end_layout
\begin_layout Standard
Transmitting a single packet is simple, but varies depending on the different
features the driver negotiated.
\end_layout
\begin_layout Enumerate
If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has not been
fully checksummed, then the virtio_net_hdr's fields are set as follows.
Otherwise, the packet must be fully checksummed, and flags is zero.
\end_layout
\begin_deeper
\begin_layout Itemize
flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
\end_layout
\begin_layout Itemize
\begin_inset CommandInset label
LatexCommand label
name "ite:csum_start-is-set"
\end_inset
csum_start is set to the offset within the packet to begin checksumming,
and
\end_layout
\begin_layout Itemize
csum_offset indicates how many bytes after the csum_start the new (16 bit
ones' complement) checksum should be placed.
\begin_inset Foot
status open
\begin_layout Plain Layout
For example, consider a partially checksummed TCP (IPv4) packet.
It will have a 14 byte ethernet header and 20 byte IP header followed by
the TCP header (with the TCP checksum field 16 bytes into that header).
csum_start will be 14+20 = 34 (the TCP checksum includes the header), and
csum_offset will be 16.
The value in the TCP checksum field should be initialized to the sum of
the TCP pseudo header, so that replacing it by the ones' complement checksum
of the TCP header and body will give the correct result.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
\begin_inset CommandInset label
LatexCommand label
name "enu:If-the-driver"
\end_inset
If the driver negotiated VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet
requires TCP segmentation or UDP fragmentation, then the
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
(Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE).
In this case, packets larger than 1514 bytes can be transmitted: the metadata
indicates how to replicate the packet header to cut it into smaller packets.
The other gso fields are set:
\end_layout
\begin_deeper
\begin_layout Itemize
hdr_len is a hint to the device as to how much of the header needs to be
kept to copy into each packet, usually set to the length of the headers,
including the transport header.
\begin_inset Foot
status open
\begin_layout Plain Layout
Due to various bugs in implementations, this field is not useful as a guarantee
of the transport header size.
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
gso_size is the maximum size of each packet beyond that header (ie.
MSS).
\end_layout
\begin_layout Itemize
If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the VIRTIO_NET_HDR_G
SO_ECN bit may be set in
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
as well, indicating that the TCP packet has the ECN bit set.
\begin_inset Foot
status open
\begin_layout Plain Layout
This case is not handled by some older hardware, so is called out specifically
in the protocol.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, the num_buffers
field is set to zero.
\end_layout
\begin_layout Enumerate
The header and packet are added as one output buffer to the transmitq, and
the device is notified of the new entry (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Notifying-The-Device"
\end_inset
).
\begin_inset Foot
status open
\begin_layout Plain Layout
Note that the header will be two bytes longer for the VIRTIO_NET_F_MRG_RXBUF
case.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection*
Packet Transmission Interrupt
\end_layout
\begin_layout Standard
Often a driver will suppress transmission interrupts using the VRING_AVAIL_F_NO_
INTERRUPT flag (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Receiving-Used-Buffers"
\end_inset
) and check for used packets in the transmit path of following packets.
However, it will still receive interrupts if the VIRTIO_F_NOTIFY_ON_EMPTY
feature is negotiated, indicating that the transmission queue is completely
emptied.
\end_layout
\begin_layout Standard
The normal behavior in this interrupt handler is to retrieve any new descriptors
from the used ring and free the corresponding headers and packets.
\end_layout
\begin_layout Subsection*
Setting Up Receive Buffers
\end_layout
\begin_layout Standard
It is generally a good idea to keep the receive virtqueue as fully populated
as possible: if it runs out, network performance will suffer.
\end_layout
\begin_layout Standard
If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or VIRTIO_NET_F_GUEST_UF
O features are used, the Guest will need to accept packets of up to 65550
bytes long (the maximum size of a TCP or UDP packet, plus the 14 byte ethernet
header), otherwise 1514 bytes.
So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every buffer in the receive
queue needs to be at least this length
\begin_inset Foot
status open
\begin_layout Plain Layout
Obviously each one can be split across multiple descriptor elements.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the
size of the
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Subsection*
Packet Receive Interrupt
\end_layout
\begin_layout Standard
When a packet is copied into a buffer in the receiveq, the optimal path
is to disable further interrupts for the receiveq (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Receiving-Used-Buffers"
\end_inset
) and process packets until no more are found, then re-enable them.
\end_layout
\begin_layout Standard
Processing a packet involves:
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, then the
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
field indicates how many descriptors this packet is spread over (including
this one).
This allows receipt of large packets without having to allocate large buffers.
In this case, there will be at least
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
buffers in the used ring, and they should be chained together to form a single
packet.
The other buffers will
\emph on
not
\emph default
begin with a
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or the
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
field is one, then the entire packet will be contained within this buffer,
immediately following the
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the VIRTIO_NET_HDR_F_NEED
S_CSUM bit in the
\begin_inset Quotes eld
\end_inset
flags
\begin_inset Quotes erd
\end_inset
field may be set: if so, the checksum on the packet is incomplete and the
\begin_inset Quotes eld
\end_inset
csum_start
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
csum_offset
\begin_inset Quotes erd
\end_inset
fields indicate how to calculate it (see
\begin_inset CommandInset ref
LatexCommand ref
reference "ite:csum_start-is-set"
\end_inset
).
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were negotiated, then
the
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
may be something other than VIRTIO_NET_HDR_GSO_NONE, and the
\begin_inset Quotes eld
\end_inset
gso_size
\begin_inset Quotes erd
\end_inset
field indicates the desired MSS (see
\begin_inset CommandInset ref
LatexCommand ref
reference "enu:If-the-driver"
\end_inset
).
\end_layout
\begin_layout Subsection*
Control Virtqueue
\end_layout
\begin_layout Standard
The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is negotiated)
to send commands to manipulate various features of the device which would
not easily map into the configuration space.
\end_layout
\begin_layout Standard
All commands are of the following form:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_ctrl {
\end_layout
\begin_layout Plain Layout
u8 class;
\end_layout
\begin_layout Plain Layout
u8 command;
\end_layout
\begin_layout Plain Layout
u8 command-specific-data[];
\end_layout
\begin_layout Plain Layout
u8 ack;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* ack values */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_ERR 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The class, command and command-specific-data are set by the driver, and
the device sets the ack byte.
There is little it can do except issue a diagnostic if the ack byte is
not VIRTIO_NET_OK.
\end_layout
\begin_layout Subsection*
Packet Receive Filtering
\end_layout
\begin_layout Standard
If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can send control
commands for promiscuous mode, multicast receiving, and filtering of MAC
addresses.
\end_layout
\begin_layout Standard
Note that in general, these commands are best-effort: unwanted packets may
still arrive.
\end_layout
\begin_layout Subsubsection*
Setting Promiscuous Mode
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX_PROMISC 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX_ALLMULTI 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The class VIRTIO_NET_CTRL_RX has two commands: VIRTIO_NET_CTRL_RX_PROMISC
turns promiscuous mode on and off, and VIRTIO_NET_CTRL_RX_ALLMULTI turns
all-multicast receive on and off.
The command-specific-data is one byte containing 0 (off) or 1 (on).
\end_layout
\begin_layout Subsubsection*
Setting MAC Address Filtering
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_ctrl_mac {
\end_layout
\begin_layout Plain Layout
u32 entries;
\end_layout
\begin_layout Plain Layout
u8 macs[entries][ETH_ALEN];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_MAC 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The device can filter incoming packets by any number of destination MAC
addresses.
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
Since there are no guarantees, it can use a hash filter or silently switch
to allmulti or promiscuous mode if it is given too many addresses.
\end_layout
\end_inset
This table is set using the class VIRTIO_NET_CTRL_MAC and the command VIRTIO_NE
T_CTRL_MAC_TABLE_SET.
The command-specific-data is two variable length tables of 6-byte MAC addresses.
The first table contains unicast addresses, and the second contains multicast
addresses.
\end_layout
\begin_layout Subsection*
VLAN Filtering
\end_layout
\begin_layout Standard
If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it can control
a VLAN filter table in the device.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN_ADD 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN_DEL 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL commands take
a 16-bit VLAN id as the command-specific-data.
\end_layout
\begin_layout Subsection*
Gratuitous Packet Sending
\end_layout
\begin_layout Standard
If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE feature (which depends on VIRTIO_NET_
F_CTRL_VQ), the device can ask the guest to send gratuitous packets; this is usually
done after the guest has been physically migrated, and needs to announce
its presence on the new network links.
(As the hypervisor does not have knowledge of the guest network configuration
(eg.
tagged vlan) it is simplest to prod the guest in this way).
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_ANNOUNCE 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The guest needs to check the VIRTIO_NET_S_ANNOUNCE bit in the status field when
it notices a change of device configuration.
The command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that the driver
has received the notification; the device clears the VIRTIO_NET_S_ANNOUNCE
bit in the status field after it receives this command.
\end_layout
\begin_layout Standard
Processing this notification involves:
\end_layout
\begin_layout Enumerate
Sending the gratuitous packets, or marking that there are pending gratuitous packets
to be sent and letting a deferred routine send them.
\end_layout
\begin_layout Enumerate
Sending the VIRTIO_NET_CTRL_ANNOUNCE_ACK command through the control vq.
\end_layout
\begin_layout Chapter*
Appendix D: Block Device
\end_layout
\begin_layout Standard
The virtio block device is a simple virtual block device (ie.
disk).
Read and write requests (and other exotic requests) are placed in the queue,
and serviced (probably out of order) by the device except where noted.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 2
\end_layout
\begin_layout Description
Virtqueues 0:requestq.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_BLK_F_BARRIER
\begin_inset space ~
\end_inset
(0) Host supports request barriers.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SIZE_MAX
\begin_inset space ~
\end_inset
(1) Maximum size of any single segment is in
\begin_inset Quotes eld
\end_inset
size_max
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SEG_MAX
\begin_inset space ~
\end_inset
(2) Maximum number of segments in a request is in
\begin_inset Quotes eld
\end_inset
seg_max
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_GEOMETRY
\begin_inset space ~
\end_inset
(4) Disk-style geometry specified in
\begin_inset Quotes eld
\end_inset
geometry
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_RO
\begin_inset space \space{}
\end_inset
(5) Device is read-only.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_BLK_SIZE
\begin_inset space ~
\end_inset
(6) Block size of disk is in
\begin_inset Quotes eld
\end_inset
blk_size
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout The capacity of the device (expressed in 512-byte sectors) is always
present.
The availability of the others all depend on various feature bits as indicated
above.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_blk_config {
\end_layout
\begin_layout Plain Layout
u64 capacity;
\end_layout
\begin_layout Plain Layout
u32 size_max;
\end_layout
\begin_layout Plain Layout
u32 seg_max;
\end_layout
\begin_layout Plain Layout
struct virtio_blk_geometry {
\end_layout
\begin_layout Plain Layout
u16 cylinders;
\end_layout
\begin_layout Plain Layout
u8 heads;
\end_layout
\begin_layout Plain Layout
u8 sectors;
\end_layout
\begin_layout Plain Layout
} geometry;
\end_layout
\begin_layout Plain Layout
u32 blk_size;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The device size should be read from the
\begin_inset Quotes eld
\end_inset
capacity
\begin_inset Quotes erd
\end_inset
configuration field.
No requests should be submitted which go beyond this limit.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the blk_size field can
be read to determine the optimal sector size for the driver to use.
This does not affect the units used in the protocol (always 512 bytes),
but awareness of the correct value can affect performance.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BLK_F_RO feature is set by the device, any write requests
will fail.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
The driver queues requests to the virtqueue, and they are used by the device
(not necessarily in order).
Each request is of form:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_blk_req {
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u32 ioprio;
\end_layout
\begin_layout Plain Layout
u64 sector;
\end_layout
\begin_layout Plain Layout
char data[][512];
\end_layout
\begin_layout Plain Layout
u8 status;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
If the device has VIRTIO_BLK_F_SCSI feature, it can also support scsi packet
command requests, each of these requests is of form:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_scsi_pc_req {
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u32 ioprio;
\end_layout
\begin_layout Plain Layout
u64 sector;
\end_layout
\begin_layout Plain Layout
char cmd[];
\end_layout
\begin_layout Plain Layout
char data[][512];
\end_layout
\begin_layout Plain Layout
#define SCSI_SENSE_BUFFERSIZE 96
\end_layout
\begin_layout Plain Layout
u8 sense[SCSI_SENSE_BUFFERSIZE];
\end_layout
\begin_layout Plain Layout
u32 errors;
\end_layout
\begin_layout Plain Layout
u32 data_len;
\end_layout
\begin_layout Plain Layout
u32 sense_len;
\end_layout
\begin_layout Plain Layout
u32 residual;
\end_layout
\begin_layout Plain Layout
u8 status;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The
\emph on
type
\emph default
of the request is either a read (VIRTIO_BLK_T_IN), a write (VIRTIO_BLK_T_OUT),
a scsi packet command (VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT
\begin_inset Foot
status open
\begin_layout Plain Layout
the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device does not
distinguish between them
\end_layout
\end_inset
) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT
\begin_inset Foot
status open
\begin_layout Plain Layout
the FLUSH and FLUSH_OUT types are equivalent, the device does not distinguish
between them
\end_layout
\end_inset
).
If the device has VIRTIO_BLK_F_BARRIER feature
\begin_inset space ~
\end_inset
the high bit (VIRTIO_BLK_T_BARRIER) indicates that this request acts as
a barrier and that all preceding requests must be complete before this
one, and all following requests must not be started until this is complete.
Note that a barrier does not flush caches in the underlying backend device
in the host, and thus does not serve as a data consistency guarantee.
The driver must use a FLUSH request to flush the host cache.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_BLK_T_IN 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_OUT 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_SCSI_CMD 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_FLUSH 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_FLUSH_OUT 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_BARRIER 0x80000000
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The
\emph on
ioprio
\emph default
field is a hint about the relative priorities of requests to the device:
higher numbers indicate more important requests.
\end_layout
\begin_layout Standard
The
\emph on
sector
\emph default
number indicates the offset (multiplied by 512) where the read or write
is to occur.
This field is unused and set to 0 for scsi packet commands and for flush
commands.
\end_layout
\begin_layout Standard
The
\emph on
cmd
\emph default
field is only present for scsi packet command requests, and indicates the
command to perform.
This field must reside in a single, separate read-only buffer; command
length can be derived from the length of this buffer.
\end_layout
\begin_layout Standard
Note that these first three (four for scsi packet commands) fields are always
read-only: the
\emph on
data
\emph default
field is either read-only or write-only, depending on the request.
The size of the read or write can be derived from the total size of the
request buffers.
\end_layout
\begin_layout Standard
The
\emph on
sense
\emph default
field is only present for scsi packet command requests, and indicates the
buffer for scsi sense data.
\end_layout
\begin_layout Standard
The
\emph on
data_len
\emph default
field is only present for scsi packet command requests; this field is deprecated,
and should be ignored by the driver.
Historically, devices copied data length there.
\end_layout
\begin_layout Standard
The
\emph on
sense_len
\emph default
field is only present for scsi packet command requests and indicates the
number of bytes actually written to the
\emph on
sense
\emph default
buffer.
\end_layout
\begin_layout Standard
The
\emph on
residual
\emph default
field is only present for scsi packet command requests and indicates the
residual size, calculated as data length - number of bytes actually transferred.
\end_layout
\begin_layout Standard
The final
\emph on
status
\emph default
byte is written by the device: either VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S
_IOERR for host or guest error or VIRTIO_BLK_S_UNSUPP for a request unsupported
by host:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_BLK_S_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_S_IOERR 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_S_UNSUPP 2
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Historically, devices assumed that the fields
\emph on
type
\emph default
,
\emph on
ioprio
\emph default
and
\emph on
sector
\emph default
reside in a single, separate read-only buffer; the fields
\emph on
errors
\emph default
,
\emph on
data_len
\emph default
,
\emph on
sense_len
\emph default
and
\emph on
residual
\emph default
reside in a single, separate write-only buffer; the
\emph on
sense
\emph default
field in a separate write-only buffer of size 96 bytes, by itself; the
fields
\emph on
errors
\emph default
,
\emph on
data_len
\emph default
,
\emph on
sense_len
\emph default
and
\emph on
residual
\emph default
in a single write-only buffer; and the
\emph on
status
\emph default
field is a separate write-only buffer of size 1 byte, by itself.
\end_layout
\begin_layout Chapter*
Appendix E: Console Device
\end_layout
\begin_layout Standard
The virtio console device is a simple device for data input and output.
A device may have one or more ports.
Each port has a pair of input and output virtqueues.
Moreover, a device has a pair of control IO virtqueues.
The control virtqueues are used to communicate information between the
device and the driver about ports being opened and closed on either side
of the connection, indication from the host about whether a particular
port is a console port, adding new ports, port hot-plug/unplug, etc., and
indication from the guest about whether a port or a device was successfully
added, port open/close, etc.
For data IO, one or more empty buffers are placed in the receive queue
for incoming data and outgoing characters are placed in the transmit queue.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 3
\end_layout
\begin_layout Description
Virtqueues 0:receiveq(port0),
1:transmitq(port0), 2:control receiveq
\begin_inset Foot
status open
\begin_layout Plain Layout
Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
\end_layout
\end_inset
, 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), ...
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_CONSOLE_F_SIZE
\begin_inset space ~
\end_inset
(0) Configuration cols and rows fields are valid.
\end_layout
\begin_layout Description
VIRTIO_CONSOLE_F_MULTIPORT (1) Device has support for multiple ports; the
configuration field max_nr_ports is valid and control virtqueues will
be used.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout The size of the console is supplied in the configuration space if
the VIRTIO_CONSOLE_F_SIZE feature is set.
Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature is set, the maximum
number of ports supported by the device can be fetched.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_console_config {
\end_layout
\begin_layout Plain Layout
u16 cols;
\end_layout
\begin_layout Plain Layout
u16 rows;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
u32 max_nr_ports;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver can read
the console dimensions from the configuration fields.
\end_layout
\begin_layout Enumerate
If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can
spawn multiple ports, not all of which may be attached to a console.
Some could be generic ports.
In this case, the control virtqueues are enabled and according to the max_nr_po
rts configuration-space value, the appropriate number of virtqueues are
created.
A control message indicating the driver is ready is sent to the host.
The host can then send control messages for adding new ports to the device.
After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
control message is sent to the host for that port so the host can let us
know of any additional configuration options set for that port.
\end_layout
\begin_layout Enumerate
The receiveq for each port is populated with one or more receive buffers.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Enumerate
For output, a buffer containing the characters is placed in the port's transmitq.
\begin_inset Foot
status open
\begin_layout Plain Layout
Because this is high importance and low bandwidth, the current Linux implementat
ion polls for the buffer to be used, rather than waiting for an interrupt,
simplifying the implementation significantly.
However, for generic serial ports with the O_NONBLOCK flag set, the polling
limitation is relaxed and the consumed buffers are freed upon the next
write or poll call or when a port is closed or hot-unplugged.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
When a buffer is used in the receiveq (signalled by an interrupt), the contents
are the input to the port associated with the virtqueue for which the notificati
on was received.
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a configuration
change interrupt may occur.
The updated size can be read from the configuration fields.
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT feature, active
ports are announced by the host using the VIRTIO_CONSOLE_PORT_ADD control
message.
The same message is used for port hot-plug as well.
\end_layout
\begin_layout Enumerate
If the host specified a port `name', a sysfs attribute is created with the
name filled in, so that udev rules can be written that can create a symlink
from the port's name to the char device for port discovery by applications
in the guest.
\end_layout
\begin_layout Enumerate
Changes to ports' state are effected by control messages.
Appropriate action is taken on the port indicated in the control message.
The layout of the structure of the control buffer and the events associated
are:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_console_control {
\end_layout
\begin_layout Plain Layout
uint32_t id; /* Port number */
\end_layout
\begin_layout Plain Layout
uint16_t event; /* The kind of control event */
\end_layout
\begin_layout Plain Layout
uint16_t value; /* Extra information for the event */
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* Some events for the internal messages (control packets) */
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_DEVICE_READY 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_ADD 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_REMOVE 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_READY 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_CONSOLE_PORT 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_RESIZE 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_OPEN 6
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_NAME 7
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
Appendix F: Entropy Device
\end_layout
\begin_layout Standard
The virtio entropy device supplies high-quality randomness for guest use.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 4
\end_layout
\begin_layout Description
Virtqueues 0:requestq.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits None currently defined
\end_layout
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout None currently defined.
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The virtqueue is initialized
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
When the driver requires random bytes, it places the descriptor of one or
more buffers in the queue.
They will be completely filled with random data by the device.
\end_layout
\begin_layout Chapter*
Appendix G: Memory Balloon Device
\end_layout
\begin_layout Standard
The virtio memory balloon device is a primitive device for managing guest
memory: the device asks for a certain amount of memory, and the guest supplies
it (or withdraws it, if the device has more than it asks for).
This allows the guest to adapt to changes in allowance of underlying physical
memory.
If the feature is negotiated, the device can also be used to communicate
guest memory statistics to the host.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 5
\end_layout
\begin_layout Description
Virtqueues 0:inflateq.
1:deflateq.
2:statsq.
\begin_inset Foot
status open
\begin_layout Plain Layout
Only if VIRTIO_BALLOON_F_STATS_VQ set
\end_layout
\end_inset
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_BALLOON_F_MUST_TELL_HOST
\begin_inset space ~
\end_inset
(0) Host must be told before pages from the balloon are used.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_F_STATS_VQ
\begin_inset space \space{}
\end_inset
(1) A virtqueue for reporting guest memory statistics is present.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout Both fields of this configuration are always available.
Note that they are little endian, despite convention that device fields
are guest endian:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_balloon_config {
\end_layout
\begin_layout Plain Layout
u32 num_pages;
\end_layout
\begin_layout Plain Layout
u32 actual;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The inflate and deflate virtqueues are identified.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
\end_layout
\begin_deeper
\begin_layout Enumerate
Identify the stats virtqueue.
\end_layout
\begin_layout Enumerate
Add one empty buffer to the stats virtqueue and notify the host.
\end_layout
\end_deeper
\begin_layout Standard
Device operation begins immediately.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Ballooning The device is driven by the receipt of a configuration change
interrupt.
\end_layout
\begin_layout Enumerate
The
\begin_inset Quotes eld
\end_inset
num_pages
\begin_inset Quotes erd
\end_inset
configuration field is examined.
If this is greater than the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
number of pages, memory must be given to the balloon.
If it is less than the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
number of pages, memory may be taken back from the balloon for general
use.
\end_layout
\begin_layout Enumerate
To supply memory to the balloon (aka.
inflate):
\end_layout
\begin_deeper
\begin_layout Enumerate
The driver constructs an array of addresses of unused memory pages.
These addresses are divided by 4096
\begin_inset Foot
status open
\begin_layout Plain Layout
This is historical, and independent of the guest page size
\end_layout
\end_inset
and the descriptor describing the resulting 32-bit array is added to the
inflateq.
\end_layout
\end_deeper
\begin_layout Enumerate
To remove memory from the balloon (aka.
deflate):
\end_layout
\begin_deeper
\begin_layout Enumerate
The driver constructs an array of addresses of memory pages it has previously
given to the balloon, as described above.
This descriptor is added to the deflateq.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the guest may not
use these requested pages until that descriptor in the deflateq has been
used by the device.
\end_layout
\begin_layout Enumerate
Otherwise, the guest may begin to re-use pages previously given to the balloon
before the device has acknowledged their withdrawal.
\begin_inset Foot
status open
\begin_layout Plain Layout
In this case, deflation advice is merely a courtesy
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
In either case, once the device has completed the inflation or deflation,
the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
field of the configuration should be updated to reflect the new number
of pages in the balloon.
\begin_inset Foot
status open
\begin_layout Plain Layout
As updates to configuration space are not atomic, this field isn't particularly
reliable, but can be used to diagnose buggy guests.
\end_layout
\end_inset
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Statistics
\end_layout
\begin_layout Standard
The stats virtqueue is atypical because communication is driven by the device
(not the driver).
The channel becomes active at driver initialization time when the driver
adds an empty buffer and notifies the device.
A request for memory statistics proceeds as follows:
\end_layout
\begin_layout Enumerate
The device pushes the buffer onto the used ring and sends an interrupt.
\end_layout
\begin_layout Enumerate
The driver pops the used buffer and discards it.
\end_layout
\begin_layout Enumerate
The driver collects memory statistics and writes them into a new buffer.
\end_layout
\begin_layout Enumerate
The driver adds the buffer to the virtqueue and notifies the device.
\end_layout
\begin_layout Enumerate
The device pops the buffer (retaining it to initiate a subsequent request)
and consumes the statistics.
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Statistics
\begin_inset space \space{}
\end_inset
Format Each statistic consists of a 16 bit tag and a 64 bit value.
Both quantities are represented in the native endian of the guest.
All statistics are optional and the driver may choose which ones to supply.
To guarantee backwards compatibility, unsupported statistics should be
omitted.
\end_layout
\begin_deeper
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_balloon_stat {
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_SWAP_IN 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_SWAP_OUT 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MAJFLT 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MINFLT 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MEMFREE 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MEMTOT 5
\end_layout
\begin_layout Plain Layout
u16 tag;
\end_layout
\begin_layout Plain Layout
u64 val;
\end_layout
\begin_layout Plain Layout
} __attribute__((packed));
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Description
Tags
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been swapped in (in
bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been swapped out
to disk (in bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MAJFLT The number of major page faults that have occurred.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MINFLT The number of minor page faults that have occurred.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used for any purpose
(in bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MEMTOT The total amount of memory available (in bytes).
\end_layout
\begin_layout Chapter*
Appendix H: Rpmsg: Remote Processor Messaging
\end_layout
\begin_layout Standard
Virtio rpmsg devices represent remote processors on the system which run
in asymmetric multi-processing (AMP) configuration, and which are usually
used to offload cpu-intensive tasks from the main application processor
(a typical SoC methodology).
\end_layout
\begin_layout Standard
Virtio is being used to communicate with those remote processors; empty
buffers are placed in one virtqueue for receiving messages, and non-empty
buffers, containing outbound messages, are enqueued in a second virtqueue
for transmission.
\end_layout
\begin_layout Standard
Numerous communication channels can be multiplexed over those two virtqueues,
so different entities, running on the application and remote processor,
can directly communicate in a point-to-point fashion.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 7
\end_layout
\begin_layout Description
Virtqueues 0:receiveq.
1:transmitq.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_RPMSG_F_NS
\begin_inset space ~
\end_inset
(0) Device sends (and is capable of receiving) name service messages announcing
the creation (or destruction) of a channel:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/**
\end_layout
\begin_layout Plain Layout
* struct rpmsg_ns_msg - dynamic name service announcement message
\end_layout
\begin_layout Plain Layout
* @name: name of remote service that is published
\end_layout
\begin_layout Plain Layout
* @addr: address of remote service that is published
\end_layout
\begin_layout Plain Layout
* @flags: indicates whether service is created or destroyed
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* This message is sent across to publish a new service (or announce
\end_layout
\begin_layout Plain Layout
* about its removal).
When we receive these messages, an appropriate
\end_layout
\begin_layout Plain Layout
* rpmsg channel (i.e. device) is created/destroyed.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
struct rpmsg_ns_msg {
\end_layout
\begin_layout Plain Layout
char name[RPMSG_NAME_SIZE];
\end_layout
\begin_layout Plain Layout
u32 addr;
\end_layout
\begin_layout Plain Layout
u32 flags;
\end_layout
\begin_layout Plain Layout
} __packed;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/**
\end_layout
\begin_layout Plain Layout
* enum rpmsg_ns_flags - dynamic name service announcement flags
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* @RPMSG_NS_CREATE: a new remote service was just created
\end_layout
\begin_layout Plain Layout
* @RPMSG_NS_DESTROY: a remote service was just destroyed
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
enum rpmsg_ns_flags {
\end_layout
\begin_layout Plain Layout
RPMSG_NS_CREATE = 0,
\end_layout
\begin_layout Plain Layout
RPMSG_NS_DESTROY = 1,
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout
\end_layout
\begin_layout Standard
At this point none currently defined.
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The initialization routine should identify the receive and transmission
virtqueues.
\end_layout
\begin_layout Enumerate
The receive virtqueue should be filled with receive buffers.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
Messages are transmitted by placing them in the transmitq, and buffers for
inbound messages are placed in the receiveq.
In any case, messages are always preceded by the following header:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/**
\end_layout
\begin_layout Plain Layout
* struct rpmsg_hdr - common header for all rpmsg messages
\end_layout
\begin_layout Plain Layout
* @src: source address
\end_layout
\begin_layout Plain Layout
* @dst: destination address
\end_layout
\begin_layout Plain Layout
* @reserved: reserved for future use
\end_layout
\begin_layout Plain Layout
* @len: length of payload (in bytes)
\end_layout
\begin_layout Plain Layout
* @flags: message flags
\end_layout
\begin_layout Plain Layout
* @data: @len bytes of message payload data
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Every message sent(/received) on the rpmsg bus begins with this header.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
struct rpmsg_hdr {
\end_layout
\begin_layout Plain Layout
u32 src;
\end_layout
\begin_layout Plain Layout
u32 dst;
\end_layout
\begin_layout Plain Layout
u32 reserved;
\end_layout
\begin_layout Plain Layout
u16 len;
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u8 data[0];
\end_layout
\begin_layout Plain Layout
} __packed;
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
Appendix I: SCSI Host Device
\end_layout
\begin_layout Standard
The virtio SCSI host device groups together one or more virtual logical
units (such as disks), and allows communicating to them using the SCSI
protocol.
An instance of the device represents a SCSI host to which many targets
and LUNs are attached.
\end_layout
\begin_layout Standard
The virtio SCSI device services two kinds of requests:
\end_layout
\begin_layout Itemize
command requests for a logical unit;
\end_layout
\begin_layout Itemize
task management functions related to a logical unit, target or command.
\end_layout
\begin_layout Standard
The device is also able to send out notifications about added and removed
logical units.
Together, these capabilities provide a SCSI transport protocol that uses
virtqueues as the transfer medium.
In the transport protocol, the virtio driver acts as the initiator, while
the virtio SCSI host provides one or more targets that receive and process
the requests.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 8
\end_layout
\begin_layout Description
Virtqueues 0:controlq; 1:eventq; 2..n:request queues.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_SCSI_F_INOUT
\begin_inset space ~
\end_inset
(0) A single request can include both read-only and write-only data buffers.
\end_layout
\begin_layout Description
VIRTIO_SCSI_F_HOTPLUG
\begin_inset space ~
\end_inset
(1) The host should enable hot-plug/hot-unplug of new LUNs and targets on
the SCSI bus.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout All fields of this configuration are always available.
\series bold
sense_size
\series default
and
\series bold
cdb_size
\series default
are writable by the guest.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_scsi_config {
\end_layout
\begin_layout Plain Layout
u32 num_queues;
\end_layout
\begin_layout Plain Layout
u32 seg_max;
\end_layout
\begin_layout Plain Layout
u32 max_sectors;
\end_layout
\begin_layout Plain Layout
u32 cmd_per_lun;
\end_layout
\begin_layout Plain Layout
u32 event_info_size;
\end_layout
\begin_layout Plain Layout
u32 sense_size;
\end_layout
\begin_layout Plain Layout
u32 cdb_size;
\end_layout
\begin_layout Plain Layout
u16 max_channel;
\end_layout
\begin_layout Plain Layout
u16 max_target;
\end_layout
\begin_layout Plain Layout
u32 max_lun;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Description
num_queues is the total number of request virtqueues exposed by the device.
The driver is free to use only one request queue, or it can use more to
achieve better performance.
\end_layout
\begin_layout Description
seg_max is the maximum number of segments that can be in a command.
A bidirectional command can include
\series bold
seg_max
\series default
input segments and
\series bold
seg_max
\series default
output segments.
\end_layout
\begin_layout Description
max_sectors is a hint to the guest about the maximum transfer size it should
use.
\end_layout
\begin_layout Description
cmd_per_lun is a hint to the guest about the maximum number of linked commands
it should send to one LUN.
The actual value to be used is the minimum of
\series bold
cmd_per_lun
\series default
and the virtqueue size.
\end_layout
\begin_layout Description
event_info_size is the maximum size that the device will fill for buffers
that the driver places in the eventq.
The driver should always put buffers at least of this size.
It is written by the device depending on the set of negotiated features.
\end_layout
\begin_layout Description
sense_size is the maximum size of the sense data that the device will write.
The default value is written by the device and will always be 96, but the
driver can modify it.
It is restored to the default when the device is reset.
\end_layout
\begin_layout Description
cdb_size is the maximum size of the CDB that the driver will write.
The default value is written by the device and will always be 32, but the
driver can likewise modify it.
It is restored to the default when the device is reset.
\end_layout
\begin_layout Description
max_channel,
\begin_inset space \space{}
\end_inset
max_target
\series medium
\begin_inset space ~
\end_inset
and
\begin_inset space \space{}
\end_inset
\series default
max_lun can be used by the driver as hints to constrain scanning the logical
units on the host.
\end_layout
\end_deeper
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Standard
The initialization routine should first of all discover the device's virtqueues.
\end_layout
\begin_layout Standard
If the driver uses the eventq, it should then place at least one buffer in
the eventq.
\end_layout
\begin_layout Standard
The driver can immediately issue requests (for example, INQUIRY or REPORT
LUNS) or task management functions (for example, I_T RESET).
\end_layout
\begin_layout Section*
Device Operation: request queues
\end_layout
\begin_layout Standard
The driver queues requests to an arbitrary request queue, and they are used
by the device on that same queue.
It is the responsibility of the driver to ensure strict request ordering
for commands placed on different queues, because they will be consumed
with no order constraints.
\end_layout
\begin_layout Standard
Requests have the following format:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_scsi_req_cmd {
\end_layout
\begin_layout Plain Layout
// Read-only part
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u64 id;
\end_layout
\begin_layout Plain Layout
u8 task_attr;
\end_layout
\begin_layout Plain Layout
u8 prio;
\end_layout
\begin_layout Plain Layout
u8 crn;
\end_layout
\begin_layout Plain Layout
char cdb[cdb_size];
\end_layout
\begin_layout Plain Layout
char dataout[];
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 sense_len;
\end_layout
\begin_layout Plain Layout
u32 residual;
\end_layout
\begin_layout Plain Layout
u16 status_qualifier;
\end_layout
\begin_layout Plain Layout
u8 status;
\end_layout
\begin_layout Plain Layout
u8 response;
\end_layout
\begin_layout Plain Layout
u8 sense[sense_size];
\end_layout
\begin_layout Plain Layout
char datain[];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* command-specific response values */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_OVERRUN 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_ABORTED 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_BAD_TARGET 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_RESET 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_BUSY 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_TARGET_FAILURE 7
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_FAILURE 9
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* task_attr */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_SIMPLE 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_ORDERED 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_HEAD 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_ACA 3
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The
\series bold
lun
\series default
field addresses a target and logical unit in the virtio-scsi device's SCSI
domain.
The only supported format for the LUN field is: first byte set to 1, second
byte set to target, third and fourth byte representing a single level LUN
structure, followed by four zero bytes.
With this representation, a virtio-scsi device can serve up to 256 targets
and 16384 LUNs per target.
\end_layout
\begin_layout Standard
The
\series bold
id
\series default
field is the command identifier (
\begin_inset Quotes eld
\end_inset
tag
\begin_inset Quotes erd
\end_inset
).
\end_layout
\begin_layout Standard
\series bold
task_attr
\series default
,
\series bold
prio
\series default
and
\series bold
crn
\series default
should be left to zero.
\series bold
task_attr
\series default
defines the task attribute as in the table above, but all task attributes
may be mapped to SIMPLE by the device;
\series bold
crn
\series default
may also be provided by clients, but is generally expected to be 0.
The maximum CRN value defined by the protocol is 255, since CRN is stored
in an 8-bit integer.
\end_layout
\begin_layout Standard
All of these fields are defined in SAM.
They are always read-only, as are the
\series bold
cdb
\series default
and
\series bold
dataout
\series default
fields.
The
\series bold
cdb_size
\series default
is taken from the configuration space.
\end_layout
\begin_layout Standard
\series bold
sense_len
\series default
and subsequent fields are always write-only.
The
\series bold
sense_len
\series default
field indicates the number of bytes actually written to the sense buffer.
The
\series bold
residual
\series default
field indicates the residual size, calculated as
\begin_inset Quotes eld
\end_inset
data_length - number_of_transferred_bytes
\begin_inset Quotes erd
\end_inset
, for read or write operations.
For bidirectional commands, the number_of_transferred_bytes includes both
read and written bytes.
A residual field that is less than the size of datain means that the dataout
field was processed entirely.
A residual field that exceeds the size of datain means that the dataout
field was processed partially and the datain field was not processed at
all.
\end_layout
\begin_layout Standard
The
\series bold
status
\series default
byte is written by the device to be the status code as defined in SAM.
\end_layout
\begin_layout Standard
The
\series bold
response
\series default
byte is written by the device to be one of the following:
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_OK when the request was completed and the status byte is filled
with a SCSI status code (not necessarily "GOOD").
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires transferring more
data than is available in the data buffers.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an ABORT TASK
or ABORT TASK SET task management function.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_BAD_TARGET if the request was never processed because the
target indicated by the
\series bold
lun
\series default
field does not exist.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus or device
reset (including a task management function).
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a problem in
the connection between the host and the target (severed link).
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a failure and the
guest should not retry on other paths.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure but retrying
on other paths might yield a different result.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_BUSY if the request failed but retrying on the same path should
work.
\end_layout
\begin_layout Description
VIRTIO_SCSI_S_FAILURE for other host or guest error.
In particular, if neither dataout nor datain is empty, and the VIRTIO_SCSI_F_IN
OUT feature has not been negotiated, the request will be immediately returned
with a response equal to VIRTIO_SCSI_S_FAILURE.
\end_layout
\begin_layout Section*
Device Operation: controlq
\end_layout
\begin_layout Standard
The controlq is used for other SCSI transport operations.
Requests have the following format:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_scsi_ctrl {
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
...
\end_layout
\begin_layout Plain Layout
u8 response;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* response values valid for all commands */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_BAD_TARGET 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_BUSY 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_TARGET_FAILURE 7
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_FAILURE 9
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_INCORRECT_LUN 12
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The
\series bold
type
\series default
identifies the remaining fields.
\end_layout
\begin_layout Standard
The following commands are defined:
\end_layout
\begin_layout Description
Task
\begin_inset space \space{}
\end_inset
management
\begin_inset space \space{}
\end_inset
function
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF 0
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_ctrl_tmf
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
// Read-only part
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u32 subtype;
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u64 id;
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u8 response;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* command-specific response values */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
The type is VIRTIO_SCSI_T_TMF; the subtype field defines which task management function is requested.
All fields except
\series bold
response
\series default
are filled by the driver.
The
\series bold
subtype
\series default
field must always be specified and identifies the requested task management
function.
\end_layout
\begin_layout Standard
Other fields may be irrelevant for the requested TMF; if so, they are ignored
but they should still be present.
The
\series bold
lun
\series default
field is in the same format specified for request queues; the single level
LUN is ignored when the task management function addresses a whole I_T
nexus.
When relevant, the value of the
\series bold
id
\series default
field is matched against the id values passed on the requestq.
\end_layout
\begin_layout Standard
The outcome of the task management function is written by the device in
the response field.
The command-specific response values map 1-to-1 with those defined in SAM.
\end_layout
\end_deeper
\begin_layout Description
Asynchronous
\begin_inset space \space{}
\end_inset
notification
\begin_inset space \space{}
\end_inset
query
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_AN_QUERY 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_ctrl_an {
\end_layout
\begin_layout Plain Layout
// Read-only part
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u32 event_requested;
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 event_actual;
\end_layout
\begin_layout Plain Layout
u8 response;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
By sending this command, the driver asks the device which events the given
LUN can report, as described in paragraphs 6.6 and A.6 of the SCSI MMC specificat
ion.
The driver writes the events it is interested in into event_requested;
the device responds by writing the events that it supports into event_actual.
\end_layout
\begin_layout Standard
The
\series bold
type
\series default
is VIRTIO_SCSI_T_AN_QUERY.
The
\series bold
lun
\series default
and
\series bold
event_requested
\series default
fields are written by the driver.
The
\series bold
event_actual
\series default
and
\series bold
response
\series default
fields are written by the device.
\end_layout
\begin_layout Standard
No command-specific values are defined for the response byte.
\end_layout
\end_deeper
\begin_layout Description
Asynchronous
\begin_inset space \space{}
\end_inset
notification
\begin_inset space \space{}
\end_inset
subscription
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_ctrl_an {
\end_layout
\begin_layout Plain Layout
// Read-only part
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u32 event_requested;
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 event_actual;
\end_layout
\begin_layout Plain Layout
u8 response;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
By sending this command, the driver asks the specified LUN to report events
for its physical interface, again as described in the SCSI MMC specification.
The driver writes the events it is interested in into event_requested;
the device responds by writing the events that it supports into event_actual.
\end_layout
\begin_layout Standard
Event types are the same as for the asynchronous notification query message.
\end_layout
\begin_layout Standard
The
\series bold
type
\series default
is VIRTIO_SCSI_T_AN_SUBSCRIBE.
The
\series bold
lun
\series default
and
\series bold
event_requested
\series default
fields are written by the driver.
The
\series bold
event_actual
\series default
and
\series bold
response
\series default
fields are written by the device.
\end_layout
\begin_layout Standard
No command-specific values are defined for the response byte.
\end_layout
\end_deeper
\begin_layout Section*
Device Operation: eventq
\end_layout
\begin_layout Standard
The eventq is used by the device to report information on logical units
that are attached to it.
The driver should always leave a few buffers ready in the eventq.
In general, the device will not queue events to cope with an empty eventq,
and will end up dropping events if it finds no buffer ready.
However, when reporting events for many LUNs (e.g.
when a whole target disappears), the device can throttle events to avoid
dropping them.
For this reason, placing 10-15 buffers on the event queue should be enough.
\end_layout
\begin_layout Standard
Buffers are placed in the eventq and filled by the device when interesting
events occur.
The buffers should be strictly write-only (device-filled) and the size
of the buffers should be at least the value given in the device's configuration
information.
\end_layout
\begin_layout Standard
Buffers returned by the device on the eventq will be referred to as "events"
in the rest of this section.
Events have the following format:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_event {
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 event;
\end_layout
\begin_layout Plain Layout
...
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Standard
If bit 31 is set in the event field, the device failed to report an event
due to missing buffers.
In this case, the driver should poll the logical units for unit attention
conditions, and/or do whatever form of bus scan is appropriate for the
guest operating system.
\end_layout
\begin_layout Standard
Other data that the device writes to the buffer depends on the contents
of the event field.
The following events are defined:
\end_layout
\begin_layout Description
No
\begin_inset space \space{}
\end_inset
event
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_NO_EVENT 0
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
This event is fired in the following cases:
\end_layout
\begin_layout Itemize
When the device detects in the eventq a buffer that is shorter than what
is indicated in the configuration field, it might use it immediately and
put this dummy value in the event field.
A well-written driver will never observe this situation.
\end_layout
\begin_layout Itemize
When events are dropped, the device may signal this event as soon as the
driver makes a buffer available, in order to request action from the driver.
In this case, of course, this event will be reported with the VIRTIO_SCSI_T_EVE
NTS_MISSED flag.
\end_layout
\end_deeper
\begin_layout Description
Transport
\begin_inset space \space{}
\end_inset
reset
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_TRANSPORT_RESET 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_event_reset {
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 event;
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u32 reason;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_RESET_HARD 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_RESET_RESCAN 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_SCSI_EVT_RESET_REMOVED 2
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
By sending this event, the device signals that a logical unit on a target
has been reset, including the case of a new device appearing or disappearing
on the bus.
The device fills in all fields.
The
\series bold
event
\series default
field is set to VIRTIO_SCSI_T_TRANSPORT_RESET.
The
\series bold
lun
\series default
field addresses a logical unit in the SCSI host.
\end_layout
\begin_layout Standard
The
\series bold
reason
\series default
value is one of the three #define values appearing above:
\end_layout
\begin_layout Itemize
\series bold
VIRTIO_SCSI_EVT_RESET_REMOVED
\series default
(
\begin_inset Quotes eld
\end_inset
LUN/target removed
\begin_inset Quotes erd
\end_inset
) is used if the target or logical unit is no longer able to receive commands.
\end_layout
\begin_layout Itemize
\series bold
VIRTIO_SCSI_EVT_RESET_HARD
\series default
(
\begin_inset Quotes eld
\end_inset
LUN hard reset
\begin_inset Quotes erd
\end_inset
) is used if the logical unit has been reset, but is still present.
\end_layout
\begin_layout Itemize
\series bold
VIRTIO_SCSI_EVT_RESET_RESCAN
\series default
(
\begin_inset Quotes eld
\end_inset
rescan LUN/target
\begin_inset Quotes erd
\end_inset
) is used if a target or logical unit has just appeared on the device.
\end_layout
\begin_layout Standard
The
\begin_inset Quotes eld
\end_inset
removed
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
rescan
\begin_inset Quotes erd
\end_inset
events, when sent for LUN 0, may apply to the entire target.
After receiving them the driver should ask the initiator to rescan the
target, in order to detect the case when an entire target has appeared
or disappeared.
These two events will never be reported unless the
\series bold
VIRTIO_SCSI_F_HOTPLUG
\series default
feature was negotiated between the host and the guest.
\end_layout
\begin_layout Standard
Events will also be reported via sense codes (this obviously does not apply
to newly appeared buses or targets, since the application has never discovered
them):
\end_layout
\begin_layout Itemize
\begin_inset Quotes eld
\end_inset
LUN/target removed
\begin_inset Quotes erd
\end_inset
maps to sense key ILLEGAL REQUEST, asc 0x25, ascq 0x00 (LOGICAL UNIT NOT
SUPPORTED)
\end_layout
\begin_layout Itemize
\begin_inset Quotes eld
\end_inset
LUN hard reset
\begin_inset Quotes erd
\end_inset
maps to sense key UNIT ATTENTION, asc 0x29 (POWER ON, RESET OR BUS DEVICE
RESET OCCURRED)
\end_layout
\begin_layout Itemize
\begin_inset Quotes eld
\end_inset
rescan LUN/target
\begin_inset Quotes erd
\end_inset
maps to sense key UNIT ATTENTION, asc 0x3f, ascq 0x0e (REPORTED LUNS DATA
HAS CHANGED)
\end_layout
\begin_layout Standard
The preferred way to detect transport reset is always to use events, because
sense codes are only seen by the driver when it sends a SCSI command to
the logical unit or target.
However, in case events are dropped, the initiator will still be able to
synchronize with the actual state of the controller if the driver asks
the initiator to rescan the SCSI bus.
During the rescan, the initiator will be able to observe the above sense
codes, and it will process them as if the driver had received the equivalent
event.
\end_layout
\end_deeper
\begin_layout Description
Asynchronous
\begin_inset space \space{}
\end_inset
notification
\begin_inset space ~
\end_inset
\begin_inset Newline newline
\end_inset
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_scsi_event_an {
\end_layout
\begin_layout Plain Layout
// Write-only part
\end_layout
\begin_layout Plain Layout
u32 event;
\end_layout
\begin_layout Plain Layout
u8 lun[8];
\end_layout
\begin_layout Plain Layout
u32 reason;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_deeper
\begin_layout Standard
By sending this event, the device signals that an asynchronous event was
fired from a physical interface.
\end_layout
\begin_layout Standard
All fields are written by the device.
The
\series bold
event
\series default
field is set to VIRTIO_SCSI_T_ASYNC_NOTIFY.
The
\series bold
lun
\series default
field addresses a logical unit in the SCSI host.
The
\series bold
reason
\series default
field is a subset of the events that the driver has subscribed to via the
"Asynchronous notification subscription" command.
\end_layout
\begin_layout Standard
When dropped events are reported, the driver should poll for asynchronous
events manually using SCSI commands.
\end_layout
\end_deeper
\begin_layout Chapter*
Appendix X: virtio-mmio
\end_layout
\begin_layout Standard
Virtual environments without PCI support (a common situation in embedded 
device models) might use a simple memory mapped device (
\begin_inset Quotes eld
\end_inset
virtio-mmio
\begin_inset Quotes erd
\end_inset
) instead of the PCI device.
\end_layout
\begin_layout Standard
The memory mapped virtio device behaviour is based on the PCI device specificati
on.
Therefore most operations, such as device initialization, queue configuration 
and buffer transfers, are nearly identical. 
Existing differences are described in the following sections.
\end_layout
\begin_layout Subsection*
Device Initialization
\end_layout
\begin_layout Standard
Instead of using the PCI IO space for virtio header, the
\begin_inset Quotes eld
\end_inset
virtio-mmio
\begin_inset Quotes erd
\end_inset
device provides a set of memory mapped control registers, all 32 bits wide,
followed by device-specific configuration space.
The following list presents their layout:
\end_layout
\begin_layout Itemize
Offset from the device base address | Direction | Name
\begin_inset Newline newline
\end_inset
Description
\end_layout
\begin_layout Itemize
0x000 | R | MagicValue
\begin_inset Newline newline
\end_inset
\begin_inset Quotes eld
\end_inset
virt
\begin_inset Quotes erd
\end_inset
string.
\end_layout
\begin_layout Itemize
0x004 | R | Version
\begin_inset Newline newline
\end_inset
Device version number.
Currently must be 1.
\end_layout
\begin_layout Itemize
0x008 | R | DeviceID
\begin_inset Newline newline
\end_inset
Virtio Subsystem Device ID (e.g. 
1 for network card).
\end_layout
\begin_layout Itemize
0x00c | R | VendorID
\begin_inset Newline newline
\end_inset
Virtio Subsystem Vendor ID.
\end_layout
\begin_layout Itemize
0x010 | R | HostFeatures
\begin_inset Newline newline
\end_inset
Flags representing features the device supports.
\begin_inset Newline newline
\end_inset
Reading from this register returns 32 consecutive flag bits, first bit
depending on the last value written to HostFeaturesSel register.
Access to this register returns bits
\begin_inset Formula $HostFeaturesSel*32$
\end_inset
to
\begin_inset Formula $(HostFeaturesSel*32)+31$
\end_inset
, eg.
feature bits 0 to 31 if HostFeaturesSel is set to 0 and feature bits 32 
to 63 if HostFeaturesSel is set to 1.
Also see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Feature-Bits"
\end_inset
\end_layout
\begin_layout Itemize
0x014 | W | HostFeaturesSel
\begin_inset Newline newline
\end_inset
Device (Host) features word selection.
\begin_inset Newline newline
\end_inset
Writing to this register selects a set of 32 device feature bits accessible
by reading from HostFeatures register.
Device driver must write a value to the HostFeaturesSel register before
reading from the HostFeatures register.
\end_layout
\begin_layout Itemize
0x020 | W | GuestFeatures
\begin_inset Newline newline
\end_inset
Flags representing device features understood and activated by the driver.
\begin_inset Newline newline
\end_inset
Writing to this register sets 32 consecutive flag bits, first bit depending
on the last value written to GuestFeaturesSel register.
Access to this register sets bits
\begin_inset Formula $GuestFeaturesSel*32$
\end_inset
to
\begin_inset Formula $(GuestFeaturesSel*32)+31$
\end_inset
, eg.
feature bits 0 to 31 if GuestFeaturesSel is set to 0 and feature bits 
32 to 63 if GuestFeaturesSel is set to 1.
Also see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Feature-Bits"
\end_inset
\end_layout
\begin_layout Itemize
0x024 | W | GuestFeaturesSel
\begin_inset Newline newline
\end_inset
Activated (Guest) features word selection.
\begin_inset Newline newline
\end_inset
Writing to this register selects a set of 32 activated feature bits accessible
by writing to the GuestFeatures register.
Device driver must write a value to the GuestFeaturesSel register before
writing to the GuestFeatures register.
\end_layout
\begin_layout Itemize
0x028 | W | GuestPageSize
\begin_inset Newline newline
\end_inset
Guest page size.
\begin_inset Newline newline
\end_inset
Device driver must write the guest page size in bytes to the register during
initialization, before any queues are used.
This value must be a power of 2 and is used by the Host to calculate Guest
address of the first queue page (see QueuePFN).
\end_layout
\begin_layout Itemize
0x030 | W | QueueSel
\begin_inset Newline newline
\end_inset
Virtual queue index (first queue is 0).
\begin_inset Newline newline
\end_inset
Writing to this register selects the virtual queue that the following operation
s on QueueNum, QueueAlign and QueuePFN apply to.
\end_layout
\begin_layout Itemize
0x034 | R | QueueNumMax
\begin_inset Newline newline
\end_inset
Maximum virtual queue size.
\begin_inset Newline newline
\end_inset
Reading from the register returns the maximum size of the queue the Host
is ready to process or zero (0x0) if the queue is not available.
This applies to the queue selected by writing to QueueSel and is allowed
only when QueuePFN is set to zero (0x0), so when the queue is not actively
used.
\end_layout
\begin_layout Itemize
0x038 | W | QueueNum
\begin_inset Newline newline
\end_inset
Virtual queue size.
\begin_inset Newline newline
\end_inset
Queue size is the number of elements in the queue, and therefore the size of 
the descriptor table and of both the available and used rings.
\begin_inset Newline newline
\end_inset
Writing to this register notifies the Host what size of the queue the Guest
will use.
This applies to the queue selected by writing to QueueSel.
\end_layout
\begin_layout Itemize
0x03c | W | QueueAlign
\begin_inset Newline newline
\end_inset
Used Ring alignment in the virtual queue.
\begin_inset Newline newline
\end_inset
Writing to this register notifies the Host about alignment boundary of
the Used Ring in bytes.
This value must be a power of 2 and applies to the queue selected by writing
to QueueSel.
\end_layout
\begin_layout Itemize
0x040 | RW | QueuePFN
\begin_inset Newline newline
\end_inset
Guest physical page number of the virtual queue.
\begin_inset Newline newline
\end_inset
Writing to this register notifies the host about location of the virtual
queue in the Guest's physical address space.
This value is the index number of a page starting with the queue Descriptor
Table.
Value zero (0x0) means physical address zero (0x00000000) and is illegal.
When the Guest stops using the queue it must write zero (0x0) to this register.
\begin_inset Newline newline
\end_inset
Reading from this register returns the currently used page number of the
queue, therefore a value other than zero (0x0) means that the queue is
in use.
\begin_inset Newline newline
\end_inset
Both read and write accesses apply to the queue selected by writing to
QueueSel.
\end_layout
\begin_layout Itemize
0x050 | W | QueueNotify
\begin_inset Newline newline
\end_inset
Queue notifier.
\begin_inset Newline newline
\end_inset
Writing a queue index to this register notifies the Host that there are
new buffers to process in the queue.
\end_layout
\begin_layout Itemize
0x060 | R | InterruptStatus
\begin_inset Newline newline
\end_inset
Interrupt status.
\begin_inset Newline newline
\end_inset
Reading from this register returns a bit mask of interrupts asserted by
the device.
An interrupt is asserted if the corresponding bit is set, ie.
equals one (1).
\end_layout
\begin_deeper
\begin_layout Itemize
Bit 0 | Used Ring Update
\begin_inset Newline newline
\end_inset
This interrupt is asserted when the Host has updated the Used Ring in at
least one of the active virtual queues.
\end_layout
\begin_layout Itemize
Bit 1 | Configuration change
\begin_inset Newline newline
\end_inset
This interrupt is asserted when configuration of the device has changed.
\end_layout
\end_deeper
\begin_layout Itemize
0x064 | W | InterruptACK
\begin_inset Newline newline
\end_inset
Interrupt acknowledge.
\begin_inset Newline newline
\end_inset
Writing to this register notifies the Host that the Guest finished handling
interrupts.
Set bits in the value clear the corresponding bits of the InterruptStatus
register.
\end_layout
\begin_layout Itemize
0x070 | RW | Status
\begin_inset Newline newline
\end_inset
Device status.
\begin_inset Newline newline
\end_inset
Reading from this register returns the current device status flags.
\begin_inset Newline newline
\end_inset
Writing non-zero values to this register sets the status flags, indicating
the Guest progress.
Writing zero (0x0) to this register triggers a device reset.
\begin_inset Newline newline
\end_inset
Also see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Device-Initialization-Sequence"
\end_inset
\end_layout
\begin_layout Itemize
0x100+ | RW | Config
\begin_inset Newline newline
\end_inset
Device-specific configuration space starts at an offset 0x100 and is accessed
with byte alignment.
Its meaning and size depend on the device and the driver.
\end_layout
\begin_layout Standard
Virtual queue size is the number of elements in the queue, and therefore the size 
of the descriptor table and of both the available and used rings.
\end_layout
\begin_layout Standard
The endianness of the registers follows the native endianness of the Guest.
Writing to registers described as
\begin_inset Quotes eld
\end_inset
R
\begin_inset Quotes erd
\end_inset
and reading from registers described as
\begin_inset Quotes eld
\end_inset
W
\begin_inset Quotes erd
\end_inset
is not permitted and can cause undefined behavior.
\end_layout
\begin_layout Standard
The device initialization is performed as described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Device-Initialization-Sequence"
\end_inset
with one exception: the Guest must notify the Host about its page size,
writing the size in bytes to GuestPageSize register before the initialization
is finished.
\end_layout
\begin_layout Standard
The memory mapped virtio devices generate a single interrupt only, therefore 
no special configuration is required.
\end_layout
\begin_layout Subsection*
Virtqueue Configuration
\end_layout
\begin_layout Standard
The virtual queue configuration is performed in a similar way to the one
described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Virtqueue-Configuration"
\end_inset
with a few additional operations:
\end_layout
\begin_layout Enumerate
Select the queue by writing its index (first queue is 0) to the QueueSel register.
\end_layout
\begin_layout Enumerate
Check if the queue is not already in use: read QueuePFN register, returned
value should be zero (0x0).
\end_layout
\begin_layout Enumerate
Read maximum queue size (number of elements) from the QueueNumMax register.
If the returned value is zero (0x0) the queue is not available.
\end_layout
\begin_layout Enumerate
Allocate and zero the queue pages in contiguous virtual memory, aligning
the Used Ring to an optimal boundary (usually page size).
Size of the allocated queue may be smaller than or equal to the maximum
size returned by the Host.
\end_layout
\begin_layout Enumerate
Notify the Host about the queue size by writing the size to QueueNum register.
\end_layout
\begin_layout Enumerate
Notify the Host about the used alignment by writing its value in bytes to
QueueAlign register.
\end_layout
\begin_layout Enumerate
Write the physical number of the first page of the queue to the QueuePFN
register.
\end_layout
\begin_layout Standard
The queue and the device are ready to begin normal operations now.
\end_layout
\begin_layout Subsection*
Device Operation
\end_layout
\begin_layout Standard
The memory mapped virtio device behaves in the same way as described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Device-Operation"
\end_inset
, with the following exceptions:
\end_layout
\begin_layout Enumerate
The device is notified about new buffers available in a queue by writing
the queue index to register QueueNotify instead of the virtio header in PCI 
I/O space (
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Notifying-The-Device"
\end_inset
).
\end_layout
\begin_layout Enumerate
The memory mapped virtio device uses a single, dedicated interrupt signal, 
which is raised when at least one of the interrupts described in the InterruptS
tatus register description is asserted.
After receiving an interrupt, the driver must read the InterruptStatus
register to check what caused the interrupt (see the register description).
After the interrupt is handled, the driver must acknowledge it by writing
a bit mask corresponding to the serviced interrupt to the InterruptACK
register.
\end_layout
\end_body
\end_document