blob: 8fee4a98e2aec8ee55792eabcbce4a4693ab78e6 [file] [log] [blame]
#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
\lyxformat 345
\begin_document
\begin_header
\textclass report
\use_default_options false
\language english
\inputencoding auto
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\cite_engine basic
\use_bibtopic false
\paperorientation portrait
\secnumdepth 3
\tocdepth 3
\paragraph_separation skip
\defskip medskip
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes true
\output_changes true
\author ""
\author ""
\author ""
\end_header
\begin_body
\begin_layout Title
Virtio PCI Card Specification
\begin_inset Newline newline
\end_inset
v0.8.8 DRAFT
\begin_inset Newline newline
\end_inset
-
\end_layout
\begin_layout Author
Rusty Russell <rusty@rustcorp.com.au>
\begin_inset Newline newline
\end_inset
IBM Corporation
\end_layout
\begin_layout Date
2010 April 29.
\end_layout
\begin_layout Chapter
Purpose and Description
\end_layout
\begin_layout Standard
This document describes the specifications of the
\begin_inset Quotes eld
\end_inset
virtio
\begin_inset Quotes erd
\end_inset
family of
\emph on
PCI
\emph default
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "PCI"
description "Peripheral Component Interconnect; a common device bus. See\\\\http://en.wikipedia.org/wiki/Peripheral Component Interconnect"
\end_inset
devices.
These devices are found in
\emph on
virtual
\emph default
\emph on
environments
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "virtualized"
description "Environments where access to hardware is restricted (and often emulated) by a hypervisor."
\end_inset
\emph default
, yet by design they are not all that different from physical PCI devices,
and this document treats them as such.
This allows the guest to use standard PCI drivers and discovery mechanisms.
\end_layout
\begin_layout Standard
The purpose of virtio and this specification is that virtual environments
and guests should have a straightforward, efficient, standard and extensible
mechanism for virtual devices, rather than boutique per-environment or
per-OS mechanisms.
\end_layout
\begin_layout Description
Straightforward: Virtio PCI devices use normal PCI mechanisms of interrupts
and DMA which should be familiar to any device driver author.
There is no exotic page-flipping or COW mechanism: it's just a PCI device.
\begin_inset Foot
status open
\begin_layout Plain Layout
This lack of page-sharing implies that the implementation of the device
(e.g.
the hypervisor or host) needs full access to the guest memory.
Communication with untrusted parties (i.e.
inter-guest communication) requires copying.
\end_layout
\end_inset
\end_layout
\begin_layout Description
Efficient: Virtio PCI devices consist of rings of descriptors for input
and output, which are neatly separated to avoid cache effects from both
guest and device writing to the same cache lines.
\end_layout
\begin_layout Description
Standard: Virtio PCI makes no assumptions about the environment in which
it operates, beyond supporting PCI.
In fact the virtio devices specified in the appendices do not require PCI
at all: they have been implemented on non-PCI buses.
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux implementation further separates the PCI virtio code from the
specific virtio drivers: these drivers are shared with the non-PCI implementati
ons (currently lguest and S/390).
\end_layout
\end_inset
\end_layout
\begin_layout Description
Extensible: Virtio PCI devices contain feature bits which are acknowledged
by the guest operating system during device setup.
This allows forwards and backwards compatibility: the device offers all
the features it knows about, and the driver acknowledges those it understands
and wishes to use.
\end_layout
\begin_layout Section
Virtqueues
\end_layout
\begin_layout Standard
The mechanism for bulk data transport on virtio PCI devices is pretentiously
called a virtqueue.
Each device can have zero or more virtqueues: for example, the network
device has one for transmit and one for receive.
\end_layout
\begin_layout Standard
Each virtqueue occupies two or more physically-contiguous pages (defined,
for the purposes of this specification, as 4096 bytes), and consists of
three parts:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="1" columns="4">
<features>
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Descriptor Table
\end_layout
\end_inset
</cell>
<cell multicolumn="1" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Available Ring
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\emph on
(padding)
\end_layout
\end_inset
</cell>
<cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Used Ring
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
When the driver wants to send buffers to the device, it puts them in one
or more slots in the descriptor table, and writes the descriptor indices
into the available ring.
It then notifies the device.
When the device has finished with the buffers, it writes the descriptors
into the used ring, and sends an interrupt.
\end_layout
\begin_layout Chapter
Specification
\end_layout
\begin_layout Section
PCI Discovery
\end_layout
\begin_layout Standard
Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 through 0x103F
inclusive is a virtio device
\begin_inset Foot
status open
\begin_layout Plain Layout
The actual value within this range is ignored
\end_layout
\end_inset
.
The device must also have a Revision ID of 0 to match this specification.
\end_layout
\begin_layout Standard
The Subsystem Device ID indicates which virtio device is supported by the
device.
The Subsystem Vendor ID should reflect the PCI Vendor ID of the environment
(it's currently only used for informational purposes by the guest).
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="7" columns="3">
<features>
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="bottom" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Subsystem Device ID
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Virtio Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Specification
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
network card
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix C
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
block device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix D
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
3
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
console
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix E
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
4
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
entropy source
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix F
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
5
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
memory ballooning
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix G
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
9
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
9P transport
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
-
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Section
Device Configuration
\end_layout
\begin_layout Standard
To configure the device, we use the first I/O region of the PCI device.
This contains a
\emph on
virtio header
\emph default
followed by a
\emph on
device-specific region.
\end_layout
\begin_layout Standard
There may be different widths of accesses to the I/O region; the
\begin_inset Quotes eld
\end_inset
natural
\begin_inset Quotes erd
\end_inset
access method for each field in the virtio header must be used (i.e.
32-bit accesses for 32-bit fields, etc), but the device-specific region
can be accessed using any width accesses, and should obtain the same results.
\end_layout
\begin_layout Standard
Note that this is possible because while the virtio header is PCI (i.e.
little) endian, the device-specific region is encoded in the native endian
of the guest (where such distinction is applicable).
\end_layout
\begin_layout Subsection
Device Initialization Sequence
\end_layout
\begin_layout Standard
We start with an overview of device initialization, then expand on the details
of the device and how each step is performed.
\end_layout
\begin_layout Enumerate
Reset the device.
This is not required on initial start up.
\end_layout
\begin_layout Enumerate
The ACKNOWLEDGE status bit is set: we have noticed the device.
\end_layout
\begin_layout Enumerate
The DRIVER status bit is set: we know how to drive the device.
\end_layout
\begin_layout Enumerate
Device-specific setup, including reading the Device Feature Bits, discovery
of virtqueues for the device, optional MSI-X setup, and reading and possibly
writing the virtio configuration space.
\end_layout
\begin_layout Enumerate
The subset of Device Feature Bits understood by the driver is written to
the device.
\end_layout
\begin_layout Enumerate
The DRIVER_OK status bit is set.
\end_layout
\begin_layout Enumerate
The device can now be used (i.e.
buffers added to the virtqueues)
\begin_inset Foot
status open
\begin_layout Plain Layout
Historically, drivers have used the device before steps 5 and 6.
This is only allowed if the driver does not use any features which would
alter this early use of the device.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
If any of these steps go irrecoverably wrong, the guest should set the FAILED
status bit to indicate that it has given up on the device (it can reset
the device later to restart if desired).
\end_layout
\begin_layout Standard
We now cover the fields required for general setup in detail.
\end_layout
\begin_layout Subsection
Virtio Header
\end_layout
\begin_layout Standard
The virtio header looks as follows:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="9">
<features>
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Guest
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
ISR
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Address
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Size
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Select
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Notify
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
If MSI-X is enabled for the device, two additional fields immediately follow
this header:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="3">
<features>
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Configuration
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
(MSI-X)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
Immediately following these general headers, there may be device-specific
headers:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="2">
<features>
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device Specific...
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Subsubsection
Device Status
\end_layout
\begin_layout Standard
The Device Status field is updated by the guest to indicate its progress.
This provides a simple low-level diagnostic: it's most useful to imagine
them hooked up to traffic lights on the console indicating the status of
each device.
\end_layout
\begin_layout Standard
The device can be reset by writing a 0 to this field, otherwise at least
one bit should be set:
\end_layout
\begin_layout Description
ACKNOWLEDGE
\begin_inset space ~
\end_inset
(1) Indicates that the guest OS has found the device and recognized it as
a valid virtio device.
\end_layout
\begin_layout Description
DRIVER
\begin_inset space ~
\end_inset
(2) Indicates that the guest OS knows how to drive the device.
Under Linux, drivers can be loadable modules so there may be a significant
(or infinite) delay before setting this bit.
\end_layout
\begin_layout Description
DRIVER_OK
\begin_inset space ~
\end_inset
(4) Indicates that the driver is set up and ready to drive the device.
\end_layout
\begin_layout Description
FAILED
\begin_inset space ~
\end_inset
(128) Indicates that something went wrong in the guest, and it has given
up on the device.
This could be an internal error, or the driver didn't like the device for
some reason, or even a fatal error during device operation.
The device must be reset before attempting to re-initialize.
\end_layout
\begin_layout Subsubsection
Feature Bits
\end_layout
\begin_layout Standard
The least significant 31 bits of the first configuration field indicate
the features that the device supports (the high bit is reserved, and will
be used to indicate the presence of future feature bits elsewhere).
The bits are allocated as follows:
\end_layout
\begin_layout Description
0
\begin_inset space ~
\end_inset
to
\begin_inset space ~
\end_inset
23 Feature bits for the specific device type
\end_layout
\begin_layout Description
24
\begin_inset space \space{}
\end_inset
to
\begin_inset space ~
\end_inset
30 Feature bits reserved for extensions to the queue mechanism
\end_layout
\begin_layout Standard
For example, feature bit 0 for a network device (i.e.
Subsystem Device ID 1) indicates that the device supports checksumming
of packets.
\end_layout
\begin_layout Standard
The feature bits are
\emph on
negotiated:
\emph default
the device lists all the features it understands in the Device Features
field, and the guest writes the subset that it understands into the Guest
Features field.
The only way to renegotiate is to reset the device.
\end_layout
\begin_layout Standard
In particular, new fields in the device configuration header are indicated
by offering a feature bit, so the guest can check before accessing that
part of the configuration space.
\end_layout
\begin_layout Standard
This allows for forwards and backwards compatibility: if the device is enhanced
with a new feature bit, older guests will not write that feature bit back
to the Guest Features field and it can go into backwards compatibility
mode.
Similarly, if a guest is enhanced with a feature that the device doesn't
support, it will not see that feature bit in the Device Features field
and can go into backwards compatibility mode (or, for poor implementations,
set the FAILED Device Status bit).
\end_layout
\begin_layout Subsubsection
Configuration/Queue Vectors
\end_layout
\begin_layout Standard
When MSI-X capability is present and enabled in the device (through standard
PCI configuration space) 4 bytes at byte offset 20 are used to map configuratio
n change and queue interrupts to MSI-X vectors.
In this case, the ISR Status field is unused, and device specific configuration
starts at byte offset 24 in virtio header structure.
When MSI-X capability is not enabled, device specific configuration starts
at byte offset 20 in virtio header.
\end_layout
\begin_layout Standard
Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of Configuration/Qu
eue Vector registers,
\emph on
maps
\emph default
interrupts triggered by the configuration change/selected queue events
respectively to the corresponding MSI-X vector.
To disable interrupts for a specific event type, unmap it by writing a
special NO_VECTOR value:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* Vector value used to disable MSI for queue */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_MSI_NO_VECTOR 0xffff
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Reading these registers returns the vector mapped to a given event, or NO_VECTOR
if unmapped.
All queue and configuration change events are unmapped by default.
\end_layout
\begin_layout Standard
Note that mapping an event to a vector might require allocating internal device
resources, and might fail.
Devices report such failures by returning the NO_VECTOR value when the
relevant Vector field is read.
After mapping an event to a vector, the driver must verify success by reading
the Vector field value: on success, the previously written value is returned,
and on failure, NO_VECTOR is returned.
If a mapping failure is detected, the driver can retry mapping with fewer vector
s, or disable MSI-X.
\end_layout
\begin_layout Section
Virtqueue Configuration
\end_layout
\begin_layout Standard
As a device can have zero or more virtqueues for bulk data transport (for
example, the network driver has two), the driver needs to configure them
as part of the device-specific configuration.
\end_layout
\begin_layout Standard
This is done as follows, for each virtqueue a device has:
\end_layout
\begin_layout Enumerate
Write the virtqueue index (first queue is 0) to the Queue Select field.
\end_layout
\begin_layout Enumerate
Read the virtqueue size from the Queue Size field, which is always a power
of 2.
This controls how big the virtqueue is (see below).
If this field is 0, the virtqueue does not exist.
\end_layout
\begin_layout Enumerate
Allocate and zero virtqueue in contiguous physical memory, on a 4096 byte
alignment.
Write the physical address, divided by 4096, to the Queue Address field.
\begin_inset Foot
status open
\begin_layout Plain Layout
The 4096 is based on the x86 page size, but it's also large enough to ensure
that the separate parts of the virtqueue are on separate cache lines.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
Optionally, if MSI-X capability is present and enabled on the device, select
a vector to use to request interrupts triggered by virtqueue events.
Write the MSI-X Table entry number corresponding to this vector in the Queue
Vector field.
Read the Queue Vector field: on success, the previously written value is returned;
on failure, the NO_VECTOR value is returned.
\end_layout
\begin_layout Standard
The Queue Size field controls the total number of bytes required for the
virtqueue according to the following formula:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define ALIGN(x) (((x) + 4095) & ~4095)
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int qsz)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 + qsz))
\end_layout
\begin_layout Plain Layout
+ ALIGN(sizeof(u16)*2 + sizeof(struct vring_used_elem)*qsz);
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Standard
This currently wastes some space with padding, but also allows future extensions.
The virtqueue layout structure looks like this (qsz is the Queue Size field,
which is a variable, so this code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[qsz];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* A ring of available descriptor heads with free-running index.
*/
\end_layout
\begin_layout Plain Layout
struct vring_avail avail;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// Padding to the next 4096 boundary.
\end_layout
\begin_layout Plain Layout
char pad[];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
struct vring_used used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
A Note on Virtqueue Endianness
\end_layout
\begin_layout Standard
Note that the
\emph on
endian
\emph default
of these fields and everything else in the virtqueue is the native endian
of the guest, not little-endian as PCI normally is.
This makes for simpler guest code, and it is assumed that the host already
has to be deeply aware of the guest endian so such an
\begin_inset Quotes eld
\end_inset
endian-aware
\begin_inset Quotes erd
\end_inset
device is not a significant issue.
\end_layout
\begin_layout Subsection
Descriptor Table
\end_layout
\begin_layout Standard
The descriptor table refers to the buffers the guest is using for the device.
The addresses are physical addresses, and the buffers can be chained via
the next field.
Each descriptor describes a buffer which is read-only or write-only, but
a chain of descriptors can contain both read-only and write-only buffers.
\end_layout
\begin_layout Standard
No descriptor chain may be more than 2^32 bytes long in total.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
u64 addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
/* This means the buffer contains a list of buffer descriptors.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_INDIRECT 4
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
/* Next field if flags & NEXT */
\end_layout
\begin_layout Plain Layout
u16 next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The number of descriptors in the table is specified by the Queue Size field
for this virtqueue.
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Indirect-Descriptors"
\end_inset
Indirect Descriptors
\end_layout
\begin_layout Standard
Some devices benefit by concurrently dispatching a large number of large
requests.
The VIRTIO_RING_F_INDIRECT_DESC feature can be used to allow this (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
To increase ring capacity it is possible to store a table of
\emph on
indirect descriptors
\emph default
anywhere in memory, and insert a descriptor in main virtqueue (with flags&INDIR
ECT on) that refers to memory buffer containing this
\emph on
indirect descriptor table
\emph default
; fields
\emph on
addr
\emph default
and
\emph on
len
\emph default
refer to the indirect table address and length in bytes, respectively.
The indirect table layout structure looks like this (len is the length
of the descriptor that refers to this table, which is a variable, so this
code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct indirect_descriptor_table {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[len / 16];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The first indirect descriptor is located at the start of the indirect descriptor
table (index 0); additional indirect descriptors are chained by the next field.
An indirect descriptor without next field (with flags&NEXT off) signals
the end of the indirect descriptor table, and transfers control back to
the main virtqueue.
An indirect descriptor can not refer to another indirect descriptor table
(flags&INDIRECT must be off).
A single indirect descriptor table can include both read-only and write-only
descriptors; write-only flag (flags&WRITE) in the descriptor that refers
to it is ignored.
\end_layout
\begin_layout Subsection
Available Ring
\end_layout
\begin_layout Standard
The available ring refers to what descriptors we are offering the device:
it refers to the head of a descriptor chain.
The
\begin_inset Quotes eld
\end_inset
flags
\begin_inset Quotes erd
\end_inset
field is currently 0 or 1: 1 indicating that we do not need an interrupt
when the device consumes a descriptor from the available ring.
This interrupt suppression is merely an optimization; it may not suppress
interrupts entirely.
\end_layout
\begin_layout Standard
The
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field indicates where we would put the
\emph on
next
\emph default
descriptor entry (modulo the ring size).
This starts at 0, and increases.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
u16 ring[qsz]; /* qsz is the Queue Size field read from device */
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Used Ring
\end_layout
\begin_layout Standard
The used ring is where the device returns buffers once it is done with them.
The flags field can be used by the device to hint that no notification
is necessary when the guest adds to the
\emph on
available
\emph default
ring (the flag is kept here because this is the only part of the virtqueue
written by the device).
\end_layout
\begin_layout Standard
Each entry in the ring is a pair: the head entry of the descriptor chain
describing the buffer (this matches an entry placed in the available ring
by the guest earlier), and the total number of bytes written into the buffer.
The latter is extremely useful for guests using untrusted buffers: if you
do not know exactly how much has been written by the device, you usually
have to zero the buffer to ensure no data leakage occurs.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
u32 id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was used (written to)
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[qsz];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Helpers for Managing Virtqueues
\end_layout
\begin_layout Standard
The Linux Kernel Source code contains the definitions above and helper routines
in a more usable form, in include/linux/virtio_ring.h.
This was explicitly licensed by IBM under the (3-clause) BSD license so
that it can be freely used by all other projects, and is reproduced (with
slight variation to remove Linux assumptions) in Appendix A.
\end_layout
\begin_layout Section
Device Operation
\end_layout
\begin_layout Standard
There are two parts to device operation: supplying new buffers to the device,
and processing used buffers from the device.
As an example, the virtio network device has two virtqueues: the transmit
virtqueue and the receive virtqueue.
The driver adds outgoing (read-only) packets to the transmit virtqueue,
and then frees them after they are used.
Similarly, incoming (write-only) buffers are added to the receive virtqueue,
and processed after they are used.
\end_layout
\begin_layout Subsection
Supplying Buffers to The Device
\end_layout
\begin_layout Standard
Actual transfer of buffers from the guest OS to the device operates as follows:
\end_layout
\begin_layout Enumerate
Place the buffer(s) into free descriptor(s).
\end_layout
\begin_deeper
\begin_layout Enumerate
If there are no free descriptors, the guest may choose to notify the device
even if notifications are suppressed (to reduce latency).
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux drivers do this only for read-only buffers: for write-only buffers,
it is assumed that the driver is merely trying to keep the receive buffer
ring full, and no notification of this expected condition is necessary.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
Place the id of the buffer in the next ring entry of the available ring.
\end_layout
\begin_layout Enumerate
The steps (1) and (2) may be performed repeatedly if batching is possible.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure the device sees the updated
descriptor table and available ring before the next step.
\end_layout
\begin_layout Enumerate
The available
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field should be increased by the number of entries added to the available
ring.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure that we update the idx field
before checking for notification suppression.
\end_layout
\begin_layout Enumerate
If notifications are not suppressed, the device should be notified of the
new buffers.
\end_layout
\begin_layout Standard
Note that the above code does not take precautions against the available
ring buffer wrapping around: this is not possible since the ring buffer
is the same size as the descriptor table, so step (1) will prevent such
a condition.
\end_layout
\begin_layout Standard
In addition, the maximum queue size is 32768 (it must be a power of 2 which
fits in 16 bits), so the 16-bit
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
value can always distinguish between a full and empty buffer.
\end_layout
\begin_layout Standard
Here is a description of each stage in more detail.
\end_layout
\begin_layout Subsubsection
Placing Buffers Into The Descriptor Table
\end_layout
\begin_layout Standard
A buffer consists of zero or more read-only physically-contiguous elements
followed by zero or more physically-contiguous write-only elements (it
must have at least one element).
This algorithm maps it into the descriptor table:
\end_layout
\begin_layout Enumerate
for each buffer element,
\family typewriter
b
\family default
:
\end_layout
\begin_deeper
\begin_layout Enumerate
Get the next free descriptor table entry,
\family typewriter
d
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.addr
\family default
to the physical address of the start of
\family typewriter
b
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.len
\family default
to the length of
\family typewriter
b
\family default
.
\end_layout
\begin_layout Enumerate
If
\family typewriter
b
\family default
is write-only, set
\family typewriter
d.flags
\family default
to VRING_DESC_F_WRITE, otherwise 0.
\end_layout
\begin_layout Enumerate
If there is a buffer element after this:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set
\family typewriter
d.next
\family default
to the index of the next free descriptor element.
\end_layout
\begin_layout Enumerate
Set the VRING_DESC_F_NEXT bit in
\family typewriter
d.flags
\family default
.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
In practice, the d.next fields are usually used to chain free descriptors,
and a separate count kept to check there are enough free descriptors before
beginning the mappings.
\end_layout
\begin_layout Subsubsection
Updating The Available Ring
\end_layout
\begin_layout Standard
The head of the buffer we mapped is the first
\family typewriter
d
\family default
in the algorithm above.
A naive implementation would do the following:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[avail->idx % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Standard
However, in general we can add many descriptors before we update the
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field (at which point they become visible to the device), so we keep a
counter of how many we've added:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[(avail->idx + added++) % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
Updating The Index Field
\end_layout
\begin_layout Standard
Once the idx field of the virtqueue is updated, the device will be able
to access the descriptor entries we've created and the memory they refer
to.
This is why a memory barrier is generally used before the idx update, to
ensure the device sees the most up-to-date copy.
\end_layout
\begin_layout Standard
The idx field always increments, and we let it wrap naturally at 65536:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->idx += added;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Notifying-The-Device"
\end_inset
Notifying The Device
\end_layout
\begin_layout Standard
Device notification occurs by writing the 16-bit virtqueue index of this
virtqueue to the Queue Notify field of the virtio header in the first I/O
region of the PCI device.
This can be expensive, however, so the device can suppress such notifications
if it doesn't need them.
We have to be careful to expose the new idx value
\emph on
before
\emph default
checking the suppression flag: it's OK to notify gratuitously, but not
to skip a notification.
So again, we use a memory barrier here before reading the flags.
\end_layout
\begin_layout Standard
If the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write to the
PCI configuration space.
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Receiving-Used-Buffers"
\end_inset
Receiving Used Buffers From The Device
\end_layout
\begin_layout Standard
Once the device has used a buffer (read from or written to it, or parts
of both, depending on the nature of the virtqueue and the device), it sends
an interrupt, following an algorithm very similar to the algorithm used
for the driver to send the device a buffer:
\end_layout
\begin_layout Enumerate
Write the head descriptor number to the next field in the used ring.
\end_layout
\begin_layout Enumerate
Update the used ring idx.
\end_layout
\begin_layout Enumerate
If the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail->flags:
\end_layout
\begin_deeper
\begin_layout Enumerate
If MSI-X capability is disabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set the lower bit of the ISR Status field for the device.
\end_layout
\begin_layout Enumerate
Send the appropriate PCI interrupt for the device.
\end_layout
\end_deeper
\begin_layout Enumerate
If MSI-X capability is enabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Request the appropriate MSI-X interrupt message for the device, Queue Vector
field sets the MSI-X Table entry number.
\end_layout
\begin_layout Enumerate
If Queue Vector field value is NO_VECTOR, no interrupt message is requested
for this event.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
The guest interrupt handler should:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: read the ISR Status field, which will reset
it to zero.
If the lower bit is zero, the interrupt was not for this device.
Otherwise, the guest driver should look through the used rings of each
virtqueue for the device, to see if any progress has been made by the device
which requires servicing.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: look through the used rings of each virtqueue
mapped to the specific MSI-X vector for the device, to see if any progress
has been made by the device which requires servicing.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
while (vq->last_seen_used != vring->used.idx) {
\end_layout
\begin_layout Plain Layout
struct vring_used_elem *e = &vring->used.ring[vq->last_seen_used % qsz];
\end_layout
\begin_layout Plain Layout
process_buffer(e);
\end_layout
\begin_layout Plain Layout
vq->last_seen_used++;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Dealing With Configuration Changes
\end_layout
\begin_layout Standard
Some virtio PCI devices can change the device configuration state, as reflected
in the virtio header in the PCI configuration space.
In this case:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: an interrupt is delivered and the second
lowest bit is set in the ISR Status field to indicate that the driver
should re-examine the configuration space.
Note that a single interrupt can indicate both that one or more virtqueues
have been used and that the configuration space has changed: even if the
config bit is set, virtqueues must be scanned.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: an interrupt message is requested.
The Configuration Vector field sets the MSI-X Table entry number to use.
If Configuration Vector field value is NO_VECTOR, no interrupt message
is requested for this event.
\end_layout
\begin_layout Chapter
Creating New Device Types
\end_layout
\begin_layout Standard
Various considerations are necessary when creating a new device type:
\end_layout
\begin_layout Section*
How Many Virtqueues?
\end_layout
\begin_layout Standard
It is possible that a very simple device will operate entirely through its
configuration space, but most will need at least one virtqueue in which
it will place requests.
A device with both input and output (eg.
console and network devices described here) needs two queues: one which
the driver fills with buffers to receive input, and one in which the driver
places buffers to transmit output.
\end_layout
\begin_layout Section*
What Configuration Space Layout?
\end_layout
\begin_layout Standard
Configuration space is generally used for rarely-changing or initialization-time
parameters.
But it is a limited resource, so it might be better to use a virtqueue
to update configuration information (the network device does this for filtering
, otherwise the table in the config space could potentially be very large).
\end_layout
\begin_layout Standard
Note that this space is generally the guest's native endian, rather than
PCI's little-endian.
\end_layout
\begin_layout Section*
What Device Number?
\end_layout
\begin_layout Standard
Currently device numbers are assigned quite freely: a simple request mail
to the author of this document or the Linux virtualization mailing list
\begin_inset Foot
status open
\begin_layout Plain Layout
https://lists.linux-foundation.org/mailman/listinfo/virtualization
\end_layout
\end_inset
will be sufficient to secure a unique one.
\end_layout
\begin_layout Standard
Meanwhile for experimental drivers, use 65535 and work backwards.
\end_layout
\begin_layout Section*
How many MSI-X vectors?
\end_layout
\begin_layout Standard
Using the optional MSI-X capability, devices can speed up interrupt processing
by removing the need for the guest driver to read the ISR Status register (which
might be an expensive operation), reducing interrupt sharing between devices
and queues within the device, and handling interrupts from multiple CPUs.
However, some systems impose a limit (which might be as low as 256) on
the total number of MSI-X vectors that can be allocated to all devices.
Devices and/or device drivers should take this into account, limiting the
number of vectors used unless the device is expected to cause a high volume
of interrupts.
Devices can control the number of vectors used by limiting the MSI-X Table
Size or not presenting MSI-X capability in PCI configuration space.
Drivers can control this by mapping events to as small a number of vectors
as possible, or disabling MSI-X capability altogether.
\end_layout
\begin_layout Section*
Message Framing
\end_layout
\begin_layout Standard
The descriptors used for a buffer should not affect the semantics of the
message, except for the total length of the buffer.
For example, a network buffer consists of a 10 byte header followed by
the network packet.
Whether this is presented in the ring descriptor chain as (say) a 10 byte
buffer and a 1514 byte buffer, or a single 1524 byte buffer, or even three
buffers, should have no effect.
\end_layout
\begin_layout Standard
In particular, no implementation should use the descriptor boundaries to
determine the size of any header in a request.
\begin_inset Foot
status open
\begin_layout Plain Layout
The current qemu device implementations mistakenly insist that the first
descriptor cover the header in these cases exactly, so a cautious driver
should arrange it so.
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Improvements
\end_layout
\begin_layout Standard
Any change to configuration space, or new virtqueues, or behavioural changes,
should be indicated by negotiation of a new feature bit.
This establishes clarity
\begin_inset Foot
status open
\begin_layout Plain Layout
Even if it does mean documenting design or implementation mistakes!
\end_layout
\end_inset
and avoids future expansion problems.
\end_layout
\begin_layout Standard
Clusters of functionality which are always implemented together can use
a single bit, but if one feature makes sense without the others they should
not be gratuitously grouped together to conserve feature bits.
We can always extend the spec when the first person needs more than 24
feature bits for their device.
\end_layout
\begin_layout Standard
\begin_inset CommandInset nomencl_print
LatexCommand printnomenclature
\end_inset
\end_layout
\begin_layout Chapter*
Appendix A: virtio_ring.h
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#ifndef VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
#define VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
/* An interface for efficient virtio implementation.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* This header is BSD licensed so anyone can use the definitions
\end_layout
\begin_layout Plain Layout
* to implement compatible drivers/servers.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Copyright 2007, 2009, IBM Corporation
\end_layout
\begin_layout Plain Layout
* All rights reserved.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Redistribution and use in source and binary forms, with or without
\end_layout
\begin_layout Plain Layout
* modification, are permitted provided that the following conditions
\end_layout
\begin_layout Plain Layout
* are met:
\end_layout
\begin_layout Plain Layout
* 1.
Redistributions of source code must retain the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer.
\end_layout
\begin_layout Plain Layout
* 2.
Redistributions in binary form must reproduce the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer in the
\end_layout
\begin_layout Plain Layout
* documentation and/or other materials provided with the distribution.
\end_layout
\begin_layout Plain Layout
* 3.
Neither the name of IBM nor the names of its contributors
\end_layout
\begin_layout Plain Layout
* may be used to endorse or promote products derived from this software
\end_layout
\begin_layout Plain Layout
* without specific prior written permission.
\end_layout
\begin_layout Plain Layout
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
IS'' AND
\end_layout
\begin_layout Plain Layout
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
\end_layout
\begin_layout Plain Layout
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
\end_layout
\begin_layout Plain Layout
* ARE DISCLAIMED.
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
\end_layout
\begin_layout Plain Layout
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
\end_layout
\begin_layout Plain Layout
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
\end_layout
\begin_layout Plain Layout
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
\end_layout
\begin_layout Plain Layout
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
\end_layout
\begin_layout Plain Layout
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY
\end_layout
\begin_layout Plain Layout
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
\end_layout
\begin_layout Plain Layout
* SUCH DAMAGE.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The Host uses this in used->flags to advise the Guest: don't kick me
\end_layout
\begin_layout Plain Layout
* when you add a buffer.
It's unreliable, so it's simply an
\end_layout
\begin_layout Plain Layout
* optimization.
Guest will still kick if it's out of buffers.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
/* The Guest uses this in avail->flags to advise the Host: don't
\end_layout
\begin_layout Plain Layout
* interrupt me when you consume a buffer.
It's unreliable, so it's
\end_layout
\begin_layout Plain Layout
* simply an optimization.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* Virtio ring descriptors: 16 bytes.
\end_layout
\begin_layout Plain Layout
* These can chain together via "next".
*/
\end_layout
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
uint64_t addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
/* We chain unused descriptors via this, too */
\end_layout
\begin_layout Plain Layout
uint16_t next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
uint16_t ring[];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
uint32_t id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was written to.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
unsigned int num;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_desc *desc;
\end_layout
\begin_layout Plain Layout
struct vring_avail *avail;
\end_layout
\begin_layout Plain Layout
struct vring_used *used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The standard layout for the ring is a continuous chunk of memory which
\end_layout
\begin_layout Plain Layout
* looks like this.
We assume num is a power of 2.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* struct vring {
\end_layout
\begin_layout Plain Layout
* // The actual descriptors (16 bytes each)
\end_layout
\begin_layout Plain Layout
* struct vring_desc desc[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of available descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 avail_flags;
\end_layout
\begin_layout Plain Layout
* __u16 avail_idx;
\end_layout
\begin_layout Plain Layout
* __u16 available[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // Padding to the next align boundary.
\end_layout
\begin_layout Plain Layout
* char pad[];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 used_flags;
\end_layout
\begin_layout Plain Layout
* __u16 used_idx;
\end_layout
\begin_layout Plain Layout
* struct vring_used_elem used[num];
\end_layout
\begin_layout Plain Layout
* };
\end_layout
\begin_layout Plain Layout
* Note: for virtio PCI, align is 4096.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
\end_layout
\begin_layout Plain Layout
unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
vr->num = num;
\end_layout
\begin_layout Plain Layout
vr->desc = p;
\end_layout
\begin_layout Plain Layout
vr->avail = p + num*sizeof(struct vring_desc);
\end_layout
\begin_layout Plain Layout
vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
\end_layout
\begin_layout Plain Layout
+ align-1)
\end_layout
\begin_layout Plain Layout
& ~(align - 1));
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int num, unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ((sizeof(struct vring_desc)*num + sizeof(uint16_t)*(2+num)
\end_layout
\begin_layout Plain Layout
+ align - 1) & ~(align - 1))
\end_layout
\begin_layout Plain Layout
+ sizeof(uint16_t)*2 + sizeof(struct vring_used_elem)*num;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
#endif /* VIRTIO_RING_H */
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
\begin_inset CommandInset label
LatexCommand label
name "cha:Reserved-Feature-Bits"
\end_inset
Appendix B: Reserved Feature Bits
\end_layout
\begin_layout Standard
Currently there are three device-independent feature bits defined:
\end_layout
\begin_layout Description
VIRTIO_F_NOTIFY_ON_EMPTY
\begin_inset space ~
\end_inset
(24) Negotiating this feature indicates that the driver wants an interrupt
if the device runs out of available descriptors on a virtqueue, even though
interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT flag.
An example of this is the networking driver: it doesn't need to know every
time a packet is transmitted, but it does need to free the transmitted
packets a finite time after they are transmitted.
It can avoid using a timer if the device interrupts it when all the packets
are transmitted.
\end_layout
\begin_layout Description
VIRTIO_F_RING_INDIRECT_DESC
\begin_inset space ~
\end_inset
(28) Negotiating this feature indicates that the driver can use descriptors
with the VRING_DESC_F_INDIRECT flag set, as described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Indirect-Descriptors"
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_F_BAD_FEATURE
\begin_inset space ~
\end_inset
(30) This feature should never be negotiated by the
guest; doing so is an indication that the guest is faulty.
\begin_inset Foot
status open
\begin_layout Plain Layout
An experimental virtio PCI driver contained in Linux version 2.6.25 had this
problem, and this feature bit can be used to detect it.
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
Appendix C: Network Device
\end_layout
\begin_layout Standard
The virtio network device is a virtual ethernet card, and is the most complex
of the devices supported so far by virtio.
It has been enhanced rapidly and demonstrates clearly how support for new features
should be added to an existing device.
Empty buffers are placed in one virtqueue for receiving packets, and outgoing
packets are enqueued into another for transmission in that order.
A third command queue is used to control advanced filtering features.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 1
\end_layout
\begin_layout Description
Virtqueues 0:receiveq.
1:transmitq.
2:controlq
\begin_inset Foot
status open
\begin_layout Plain Layout
Only if VIRTIO_NET_F_CTRL_VQ set
\end_layout
\end_inset
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_NET_F_CSUM
\begin_inset space ~
\end_inset
(0) Device handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_CSUM
\begin_inset space ~
\end_inset
(1) Guest handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_MAC
\begin_inset space ~
\end_inset
(5) Device has given MAC address.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GSO
\begin_inset space ~
\end_inset
(6) (Deprecated) device handles packets with any GSO type.
\begin_inset Foot
status open
\begin_layout Plain Layout
It was supposed to indicate segmentation offload support, but upon further
investigation it became clear that multiple bits were required.
\end_layout
\end_inset
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO4
\begin_inset space ~
\end_inset
(7) Guest can receive TSOv4.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO6
\begin_inset space ~
\end_inset
(8) Guest can receive TSOv6.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_ECN
\begin_inset space ~
\end_inset
(9) Guest can receive TSO with ECN.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_UFO
\begin_inset space ~
\end_inset
(10) Guest can receive UFO.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_TSO4
\begin_inset space ~
\end_inset
(11) Device can receive TSOv4.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_TSO6
\begin_inset space ~
\end_inset
(12) Device can receive TSOv6.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_ECN
\begin_inset space ~
\end_inset
(13) Device can receive TSO with ECN.
\end_layout
\begin_layout Description
VIRTIO_NET_F_HOST_UFO
\begin_inset space ~
\end_inset
(14) Device can receive UFO.
\end_layout
\begin_layout Description
VIRTIO_NET_F_MRG_RXBUF
\begin_inset space ~
\end_inset
(15) Guest can merge receive buffers.
\end_layout
\begin_layout Description
VIRTIO_NET_F_STATUS
\begin_inset space ~
\end_inset
(16) Configuration status field is available.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_VQ
\begin_inset space ~
\end_inset
(17) Control channel is available.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_RX
\begin_inset space ~
\end_inset
(18) Control channel RX mode support.
\end_layout
\begin_layout Description
VIRTIO_NET_F_CTRL_VLAN
\begin_inset space ~
\end_inset
(19) Control channel VLAN filtering.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout Two configuration fields are currently defined.
The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
Only one bit is currently defined for the status field: VIRTIO_NET_S_LINK_UP.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_S_LINK_UP 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct virtio_net_config {
\end_layout
\begin_layout Plain Layout
u8 mac[6];
\end_layout
\begin_layout Plain Layout
u16 status;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The initialization routine should identify the receive and transmission
virtqueues.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_MAC feature bit is set, the configuration space
\begin_inset Quotes eld
\end_inset
mac
\begin_inset Quotes erd
\end_inset
entry indicates the
\begin_inset Quotes eld
\end_inset
physical
\begin_inset Quotes erd
\end_inset
address of the network card, otherwise a private MAC address should
be assigned.
All guests are expected to negotiate this feature if it is set.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify the control
virtqueue.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link status can
be read from the bottom bit of the
\begin_inset Quotes eld
\end_inset
status
\begin_inset Quotes erd
\end_inset
config field.
Otherwise, the link should be assumed active.
\end_layout
\begin_layout Enumerate
The receive virtqueue should be filled with receive buffers.
This is described in detail below in
\begin_inset Quotes eld
\end_inset
Setting Up Receive Buffers
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Enumerate
A driver can indicate that it will generate checksumless packets by negotiating
the VIRTIO_NET_F_CSUM feature.
This
\begin_inset Quotes eld
\end_inset
checksum offload
\begin_inset Quotes erd
\end_inset
is a common feature on modern network cards.
\end_layout
\begin_layout Enumerate
If that feature is negotiated, a driver can use TCP or UDP segmentation
offload by negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), VIRTIO_NET_F_HOST
_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features.
It should not send TCP packets requiring segmentation offload which have
the Explicit Congestion Notification bit set, unless the VIRTIO_NET_F_HOST_ECN
feature is negotiated.
\begin_inset Foot
status open
\begin_layout Plain Layout
This is a common restriction in real, older network cards.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
The converse features are also available: a driver can save the virtual
device some work by negotiating these features.
\begin_inset Foot
status open
\begin_layout Plain Layout
For example, a network packet transported between two guests on the same
system may not require checksumming at all, nor segmentation, if both guests
are amenable.
\end_layout
\end_inset
The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially checksummed
packets can be received, and if it can do that then the VIRTIO_NET_F_GUEST_TSO4
, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN
are the input equivalents of the features described above.
See
\begin_inset Quotes eld
\end_inset
Receiving Packets
\begin_inset Quotes erd
\end_inset
below.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
Packets are transmitted by placing them in the transmitq, and buffers for
incoming packets are placed in the receiveq.
In each case, the packet itself is preceded by a header:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_hdr {
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
\end_layout
\begin_layout Plain Layout
u8 flags;
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_NONE 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_TCPV4 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_UDP 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_TCPV6 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_HDR_GSO_ECN 0x80
\end_layout
\begin_layout Plain Layout
u8 gso_type;
\end_layout
\begin_layout Plain Layout
u16 hdr_len;
\end_layout
\begin_layout Plain Layout
u16 gso_size;
\end_layout
\begin_layout Plain Layout
u16 csum_start;
\end_layout
\begin_layout Plain Layout
u16 csum_offset;
\end_layout
\begin_layout Plain Layout
/* Only if VIRTIO_NET_F_MRG_RXBUF: */
\end_layout
\begin_layout Plain Layout
u16 num_buffers;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The controlq is used to control device features such as filtering.
\end_layout
\begin_layout Subsection*
Packet Transmission
\end_layout
\begin_layout Standard
Transmitting a single packet is simple, but varies depending on the different
features the driver negotiated.
\end_layout
\begin_layout Enumerate
If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has not been
fully checksummed, then the virtio_net_hdr's fields are set as follows.
Otherwise, the packet must be fully checksummed, and flags is zero.
\end_layout
\begin_deeper
\begin_layout Itemize
flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
\end_layout
\begin_layout Itemize
\begin_inset CommandInset label
LatexCommand label
name "ite:csum_start-is-set"
\end_inset
csum_start is set to the offset within the packet to begin checksumming,
and
\end_layout
\begin_layout Itemize
csum_offset indicates how many bytes after the csum_start the new (16 bit
ones' complement) checksum should be placed.
\begin_inset Foot
status open
\begin_layout Plain Layout
For example, consider a partially checksummed TCP (IPv4) packet.
It will have a 14 byte ethernet header and 20 byte IP header followed by
the TCP header (with the TCP checksum field 16 bytes into that header).
csum_start will be 14+20 = 34 (the TCP checksum includes the header), and
csum_offset will be 16.
The value in the TCP checksum field will be the sum of the TCP pseudo header,
so that replacing it by the ones' complement checksum of the TCP header
and body will give the correct result.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
\begin_inset CommandInset label
LatexCommand label
name "enu:If-the-driver"
\end_inset
If the driver negotiated VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet
requires TCP segmentation or UDP fragmentation, then the
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
(Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE).
In this case, packets larger than 1514 bytes can be transmitted: the metadata
indicates how to replicate the packet header to cut it into smaller packets.
The other gso fields are set:
\end_layout
\begin_deeper
\begin_layout Itemize
hdr_len is a hint to the device as to how much of the header needs to be
kept to copy into each packet, usually set to the length of the headers,
including the transport header.
\begin_inset Foot
status open
\begin_layout Plain Layout
Due to various bugs in implementations, this field is not useful as a guarantee
of the transport header size.
\end_layout
\end_inset
\end_layout
\begin_layout Itemize
gso_size is the size of the packet beyond that header (ie.
MSS).
\end_layout
\begin_layout Itemize
If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the VIRTIO_NET_HDR_G
SO_ECN bit may be set in
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
as well, indicating that the TCP packet has the ECN bit set.
\begin_inset Foot
status open
\begin_layout Plain Layout
This case is not handled by some older hardware, so is called out specifically
in the protocol.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, the num_buffers
field is set to zero.
\end_layout
\begin_layout Enumerate
The header and packet are added as one output buffer to the transmitq, and
the device is notified of the new entry (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Notifying-The-Device"
\end_inset
).
\begin_inset Foot
status open
\begin_layout Plain Layout
Note that the header will be two bytes longer for the VIRTIO_NET_F_MRG_RXBUF
case.
\end_layout
\end_inset
\end_layout
\begin_layout Subsection*
Packet Transmission Interrupt
\end_layout
\begin_layout Standard
Often a driver will suppress transmission interrupts using the VRING_AVAIL_F_NO_
INTERRUPT flag (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Receiving-Used-Buffers"
\end_inset
) and check for used packets in the transmit path of following packets.
However, it will still receive interrupts if the VIRTIO_F_NOTIFY_ON_EMPTY
feature is negotiated, indicating that the transmission queue is completely
emptied.
\end_layout
\begin_layout Standard
The normal behavior in this interrupt handler is to retrieve any new descriptors
from the used ring and free the corresponding headers and packets.
\end_layout
\begin_layout Subsection*
Setting Up Receive Buffers
\end_layout
\begin_layout Standard
It is generally a good idea to keep the receive virtqueue as fully populated
as possible: if it runs out, network performance will suffer.
\end_layout
\begin_layout Standard
If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or VIRTIO_NET_F_GUEST_UF
O features are used, the Guest will need to accept packets of up to 65550
bytes long (the maximum size of a TCP or UDP packet, plus the 14 byte ethernet
header), otherwise 1514 bytes.
So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every buffer in the receive
queue needs to be at least this length
\begin_inset Foot
status open
\begin_layout Plain Layout
Obviously each one can be split across multiple descriptor elements.
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the
size of the
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Subsection*
Packet Receive Interrupt
\end_layout
\begin_layout Standard
When a packet is copied into a buffer in the receiveq, the optimal path
is to disable further interrupts for the receiveq (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Receiving-Used-Buffers"
\end_inset
) and process packets until no more are found, then re-enable them.
\end_layout
\begin_layout Standard
Processing a packet involves:
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, then the
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
field indicates how many descriptors this packet is spread over (including
this one).
This allows receipt of large packets without having to allocate large buffers.
In this case, there will be at least
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
entries in the used ring, and they should be chained together to form a single
packet.
The other buffers will
\emph on
not
\emph default
begin with a
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or the
\begin_inset Quotes eld
\end_inset
num_buffers
\begin_inset Quotes erd
\end_inset
field is one, then the entire packet will be contained within this buffer,
immediately following the
\family typewriter
struct virtio_net_hdr
\family default
.
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the VIRTIO_NET_HDR_F_NEED
S_CSUM bit in the
\begin_inset Quotes eld
\end_inset
flags
\begin_inset Quotes erd
\end_inset
field may be set: if so, the checksum on the packet is incomplete and the
\begin_inset Quotes eld
\end_inset
csum_start
\begin_inset Quotes erd
\end_inset
and
\begin_inset Quotes eld
\end_inset
csum_offset
\begin_inset Quotes erd
\end_inset
fields indicate how to calculate it (see
\begin_inset CommandInset ref
LatexCommand ref
reference "ite:csum_start-is-set"
\end_inset
).
\end_layout
\begin_layout Enumerate
If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were negotiated, then
the
\begin_inset Quotes eld
\end_inset
gso_type
\begin_inset Quotes erd
\end_inset
may be something other than VIRTIO_NET_HDR_GSO_NONE, and the
\begin_inset Quotes eld
\end_inset
gso_size
\begin_inset Quotes erd
\end_inset
field indicates the desired MSS (see
\begin_inset CommandInset ref
LatexCommand ref
reference "enu:If-the-driver"
\end_inset
).
\end_layout
\begin_layout Subsection*
Control Virtqueue
\end_layout
\begin_layout Standard
The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is negotiated)
to send commands to manipulate various features of the device which would
not easily map into the configuration space.
\end_layout
\begin_layout Standard
All commands are of the following form:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_ctrl {
\end_layout
\begin_layout Plain Layout
u8 class;
\end_layout
\begin_layout Plain Layout
u8 command;
\end_layout
\begin_layout Plain Layout
u8 command-specific-data[];
\end_layout
\begin_layout Plain Layout
u8 ack;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* ack values */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_ERR 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The class, command and command-specific-data are set by the driver, and
the device sets the ack byte.
There is little it can do except issue a diagnostic if the ack byte is
not VIRTIO_NET_OK.
\end_layout
\begin_layout Subsection*
Packet Receive Filtering
\end_layout
\begin_layout Standard
If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can send control
commands for promiscuous mode, multicast receiving, and filtering of MAC
addresses.
\end_layout
\begin_layout Standard
Note that in general, these commands are best-effort: unwanted packets may
still arrive.
\end_layout
\begin_layout Subsubsection*
Setting Promiscuous Mode
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX_PROMISC 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_RX_ALLMULTI 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The class VIRTIO_NET_CTRL_RX has two commands: VIRTIO_NET_CTRL_RX_PROMISC
turns promiscuous mode on and off, and VIRTIO_NET_CTRL_RX_ALLMULTI turns
all-multicast receive on and off.
The command-specific-data is one byte containing 0 (off) or 1 (on).
\end_layout
\begin_layout Subsubsection*
Setting MAC Address Filtering
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_net_ctrl_mac {
\end_layout
\begin_layout Plain Layout
u32 entries;
\end_layout
\begin_layout Plain Layout
u8 macs[entries][ETH_ALEN];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_MAC 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The device can filter incoming packets by any number of destination MAC
addresses.
\begin_inset Foot
status open
\begin_layout Plain Layout
Since there are no guarantees, it can use a hash filter or silently switch
to allmulti or promiscuous mode if it is given too many addresses.
\end_layout
\end_inset
This table is set using the class VIRTIO_NET_CTRL_MAC and the command VIRTIO_NE
T_CTRL_MAC_TABLE_SET.
The command-specific-data is two variable length tables of 6-byte MAC addresses.
The first table contains unicast addresses, and the second contains multicast
addresses.
\end_layout
\begin_layout Subsection*
VLAN Filtering
\end_layout
\begin_layout Standard
If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it can control
a VLAN filter table in the device.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN_ADD 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_NET_CTRL_VLAN_DEL 1
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL commands take
a 16-bit VLAN id as the command-specific-data.
\end_layout
\begin_layout Chapter*
Appendix D: Block Device
\end_layout
\begin_layout Standard
The virtio block device is a simple virtual block device (ie.
disk).
Read and write requests (and other exotic requests) are placed in the queue,
and serviced (probably out of order) by the device except where noted.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 2
\end_layout
\begin_layout Description
Virtqueues 0:requestq.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_BLK_F_BARRIER
\begin_inset space ~
\end_inset
(0) Host supports request barriers.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SIZE_MAX
\begin_inset space ~
\end_inset
(1) Maximum size of any single segment is in
\begin_inset Quotes eld
\end_inset
size_max
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SEG_MAX
\begin_inset space ~
\end_inset
(2) Maximum number of segments in a request is in
\begin_inset Quotes eld
\end_inset
seg_max
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_GEOMETRY
\begin_inset space ~
\end_inset
(4) Disk-style geometry specified in
\begin_inset Quotes eld
\end_inset
geometry
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_RO
\begin_inset space ~
\end_inset
(5) Device is read-only.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_BLK_SIZE
\begin_inset space ~
\end_inset
(6) Block size of disk is in
\begin_inset Quotes eld
\end_inset
blk_size
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_BLK_F_SECTOR_MAX
\begin_inset space ~
\end_inset
(10) Maximum total sectors in an I/O.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout The capacity of the device (expressed in 512-byte sectors) is always
present.
The availability of the others all depend on various feature bits as indicated
above.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_blk_config {
\end_layout
\begin_layout Plain Layout
u64 capacity;
\end_layout
\begin_layout Plain Layout
u32 size_max;
\end_layout
\begin_layout Plain Layout
u32 seg_max;
\end_layout
\begin_layout Plain Layout
struct virtio_blk_geometry {
\end_layout
\begin_layout Plain Layout
u16 cylinders;
\end_layout
\begin_layout Plain Layout
u8 heads;
\end_layout
\begin_layout Plain Layout
u8 sectors;
\end_layout
\begin_layout Plain Layout
} geometry;
\end_layout
\begin_layout Plain Layout
u32 blk_size;
\end_layout
\begin_layout Plain Layout
u32 sectors_max;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The device size should be read from the
\begin_inset Quotes eld
\end_inset
capacity
\begin_inset Quotes erd
\end_inset
configuration field.
No requests should be submitted which go beyond this limit.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the blk_size field can
be read to determine the optimal sector size for the driver to use.
This does not affect the units used in the protocol (always 512 bytes),
but awareness of the correct value can affect performance.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BLK_F_RO feature is set by the device, any write requests
will fail.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BLK_F_SECTOR_MAX feature is negotiated, the sectors_max field
should be read to determine the maximum I/O size for the driver to use.
No requests should be submitted which go beyond this limit.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
The driver queues requests to the virtqueue, and they are used by the device
(not necessarily in order).
Each request is of the form:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_blk_req {
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_IN 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_OUT 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_T_BARRIER 0x80000000
\end_layout
\begin_layout Plain Layout
u32 type;
\end_layout
\begin_layout Plain Layout
u32 ioprio;
\end_layout
\begin_layout Plain Layout
u64 sector;
\end_layout
\begin_layout Plain Layout
char data[][512];
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_S_OK 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BLK_S_IOERR 1
\end_layout
\begin_layout Plain Layout
u8 status;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The type of the request is either a read (VIRTIO_BLK_T_IN) or a write (VIRTIO_BL
K_T_OUT); the high bit indicates that this request acts as a barrier and
that all preceding requests must be complete before this one, and all
following requests must not be started until this is complete.
\end_layout
\begin_layout Standard
The ioprio field is a hint about the relative priorities of requests to
the device: higher numbers indicate more important requests.
\end_layout
\begin_layout Standard
The sector number indicates the offset (multiplied by 512) where the read
or write is to occur.
\end_layout
\begin_layout Standard
Note that these first three fields are always read-only: the data field
is either read-only or write-only, depending on the type of the request.
The size of the read or write can be derived from the total size of the
request buffer.
\end_layout
\begin_layout Standard
The final byte is written by the device: either VIRTIO_BLK_S_OK or VIRTIO_BLK_S_
IOERR.
\end_layout
\begin_layout Chapter*
Appendix E: Console Device
\end_layout
\begin_layout Standard
The virtio console device is a simple device for data input and output.
A device may have one or more ports.
Each port has a pair of input and output virtqueues.
Moreover, a device has a pair of control IO virtqueues.
The control virtqueues are used to communicate information between the
device and the driver about ports being opened and closed on either side
of the connection, indication from the host about whether a particular
port is a console port, adding new ports, port hot-plug/unplug, etc., and
indication from the guest about whether a port or a device was successfully
added, port open/close, etc.
For data IO, one or more empty buffers are placed in the receive queue
for incoming data and outgoing characters are placed in the transmit queue.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 3
\end_layout
\begin_layout Description
Virtqueues 0:receiveq(port0).
1:transmitq(port0), 2:control receiveq, 3:control transmitq, 4:receiveq(port1),
5:transmitq(port1), ...
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_CONSOLE_F_SIZE
\begin_inset space ~
\end_inset
(0) Configuration cols and rows fields are valid.
\end_layout
\begin_layout Description
VIRTIO_CONSOLE_F_MULTIPORT
\begin_inset space ~
\end_inset
(1) Device has support for multiple ports; configuration field max_nr_ports
is valid and control virtqueues will be used.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout The size of the console is supplied in the configuration space if
the VIRTIO_CONSOLE_F_SIZE feature is set.
Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature is set, the maximum
number of ports supported by the device can be fetched.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_console_config {
\end_layout
\begin_layout Plain Layout
u16 cols;
\end_layout
\begin_layout Plain Layout
u16 rows;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
u32 max_nr_ports;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver can read
the console dimensions from the configuration fields.
\end_layout
\begin_layout Enumerate
If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can
spawn multiple ports, not all of which may be attached to a console.
Some could be generic ports.
In this case, the control virtqueues are enabled and according to the max_nr_po
rts configuration-space value, the appropriate number of virtqueues are
created.
A control message indicating the driver is ready is sent to the host.
The host can then send control messages for adding new ports to the device.
After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
control message is sent to the host for that port so the host can let us
know of any additional configuration options set for that port.
\end_layout
\begin_layout Enumerate
The receiveq for each port is populated with one or more receive buffers.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Enumerate
For output, a buffer containing the characters is placed in the port's transmitq.
\begin_inset Foot
status open
\begin_layout Plain Layout
Because this is high importance and low bandwidth, the current Linux implementat
ion polls for the buffer to be used, rather than waiting for an interrupt,
simplifying the implementation significantly.
However, for generic serial ports with the O_NONBLOCK flag set, the polling
limitation is relaxed and the consumed buffers are freed upon the next
write or poll call or when a port is closed or hot-unplugged.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
When a buffer is used in the receiveq (signalled by an interrupt), the contents
are the input to the port associated with the virtqueue for which the notificati
on was received.
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a configuration
change interrupt may occur.
The updated size can be read from the configuration fields.
\end_layout
\begin_layout Enumerate
If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT feature, active
ports are announced by the host using the VIRTIO_CONSOLE_PORT_ADD control
message.
The same message is used for port hot-plug as well.
\end_layout
\begin_layout Enumerate
If the host specified a port 'name', a sysfs attribute is created with the
name filled in, so that udev rules can be written that can create a symlink
from the port's name to the char device for port discovery by applications
in the guest.
\end_layout
\begin_layout Enumerate
Changes to ports' state are effected by control messages.
Appropriate action is taken on the port indicated in the control message.
The layout of the structure of the control buffer and the events associated
are:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_console_control {
\end_layout
\begin_layout Plain Layout
uint32_t id; /* Port number */
\end_layout
\begin_layout Plain Layout
uint16_t event; /* The kind of control event */
\end_layout
\begin_layout Plain Layout
uint16_t value; /* Extra information for the event */
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* Some events for the internal messages (control packets) */
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_DEVICE_READY 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_ADD 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_REMOVE 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_READY 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_CONSOLE_PORT 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_RESIZE 5
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_OPEN 6
\end_layout
\begin_layout Plain Layout
#define VIRTIO_CONSOLE_PORT_NAME 7
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
Appendix F: Entropy Device
\end_layout
\begin_layout Standard
The virtio entropy device supplies high-quality randomness for guest use.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 4
\end_layout
\begin_layout Description
Virtqueues 0:requestq.
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits None currently defined
\end_layout
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout None currently defined.
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The virtqueue is initialized.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Standard
When the driver requires random bytes, it places the descriptor of one or
more buffers in the queue.
They will be completely filled with random data by the device.
\end_layout
\begin_layout Chapter*
Appendix G: Memory Balloon Device
\end_layout
\begin_layout Standard
The virtio memory balloon device is a primitive device for managing guest
memory: the device asks for a certain amount of memory, and the guest supplies
it (or withdraws it, if the device has more than it asks for).
This allows the guest to adapt to changes in allowance of underlying physical
memory.
If the feature is negotiated, the device can also be used to communicate
guest memory statistics to the host.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 5
\end_layout
\begin_layout Description
Virtqueues 0:inflateq.
1:deflateq.
2:statsq.
\begin_inset Foot
status open
\begin_layout Plain Layout
Only if VIRTIO_BALLOON_F_STATS_VQ set
\end_layout
\end_inset
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_BALLOON_F_MUST_TELL_HOST
\begin_inset space ~
\end_inset
(0) Host must be told before pages from the balloon are used.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_F_STATS_VQ
\begin_inset space \space{}
\end_inset
(1) A virtqueue for reporting guest memory statistics is present.
\end_layout
\end_deeper
\begin_layout Description
Device
\begin_inset space ~
\end_inset
configuration
\begin_inset space ~
\end_inset
layout Both fields of this configuration are always available.
Note that they are little endian, despite the convention that device fields 
are guest endian:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_balloon_config {
\end_layout
\begin_layout Plain Layout
u32 num_pages;
\end_layout
\begin_layout Plain Layout
u32 actual;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Initialization
\end_layout
\begin_layout Enumerate
The inflate and deflate virtqueues are identified.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
\end_layout
\begin_deeper
\begin_layout Enumerate
Identify the stats virtqueue.
\end_layout
\begin_layout Enumerate
Add one empty buffer to the stats virtqueue and notify the host.
\end_layout
\end_deeper
\begin_layout Standard
Device operation begins immediately.
\end_layout
\begin_layout Section*
Device Operation
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Ballooning The device is driven by the receipt of a configuration change
interrupt.
\end_layout
\begin_layout Enumerate
The
\begin_inset Quotes eld
\end_inset
num_pages
\begin_inset Quotes erd
\end_inset
configuration field is examined.
If this is greater than the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
number of pages, memory must be given to the balloon.
If it is less than the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
number of pages, memory may be taken back from the balloon for general
use.
\end_layout
\begin_layout Enumerate
To supply memory to the balloon (aka.
inflate):
\end_layout
\begin_deeper
\begin_layout Enumerate
The driver constructs an array of addresses of unused memory pages.
These addresses are divided by 4096
\begin_inset Foot
status open
\begin_layout Plain Layout
This is historical, and independent of the guest page size
\end_layout
\end_inset
and the descriptor describing the resulting 32-bit array is added to the
inflateq.
\end_layout
\end_deeper
\begin_layout Enumerate
To remove memory from the balloon (aka.
deflate):
\end_layout
\begin_deeper
\begin_layout Enumerate
The driver constructs an array of addresses of memory pages it has previously
given to the balloon, as described above.
This descriptor is added to the deflateq.
\end_layout
\begin_layout Enumerate
If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the guest may not
use these requested pages until that descriptor in the deflateq has been
used by the device.
\end_layout
\begin_layout Enumerate
Otherwise, the guest may begin to re-use pages previously given to the balloon
before the device has acknowledged their withdrawal.
\begin_inset Foot
status open
\begin_layout Plain Layout
In this case, deflation advice is merely a courtesy
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
In either case, once the device has completed the inflation or deflation,
the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
field of the configuration should be updated to reflect the new number
of pages in the balloon.
\begin_inset Foot
status open
\begin_layout Plain Layout
As updates to configuration space are not atomic, this field isn't particularly
reliable, but can be used to diagnose buggy guests.
\end_layout
\end_inset
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Statistics
\end_layout
\begin_layout Standard
The stats virtqueue is atypical because communication is driven by the device
(not the driver).
The channel becomes active at driver initialization time when the driver
adds an empty buffer and notifies the device.
A request for memory statistics proceeds as follows:
\end_layout
\begin_layout Enumerate
The device pushes the buffer onto the used ring and sends an interrupt.
\end_layout
\begin_layout Enumerate
The driver pops the used buffer and discards it.
\end_layout
\begin_layout Enumerate
The driver collects memory statistics and writes them into a new buffer.
\end_layout
\begin_layout Enumerate
The driver adds the buffer to the virtqueue and notifies the device.
\end_layout
\begin_layout Enumerate
The device pops the buffer (retaining it to initiate a subsequent request)
and consumes the statistics.
\end_layout
\begin_layout Description
Memory
\begin_inset space \space{}
\end_inset
Statistics
\begin_inset space \space{}
\end_inset
Format Each statistic consists of a 16-bit tag and a 64-bit value. 
Both quantities are represented in the native endianness of the guest. 
All statistics are optional and the driver may choose which ones to supply.
To guarantee backwards compatibility, unsupported statistics should be
omitted.
\end_layout
\begin_deeper
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct virtio_balloon_stat {
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_SWAP_IN 0
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_SWAP_OUT 1
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MAJFLT 2
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MINFLT 3
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MEMFREE 4
\end_layout
\begin_layout Plain Layout
#define VIRTIO_BALLOON_S_MEMTOT 5
\end_layout
\begin_layout Plain Layout
u16 tag;
\end_layout
\begin_layout Plain Layout
u64 val;
\end_layout
\begin_layout Plain Layout
} __attribute__((packed));
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Description
Tags
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been swapped in (in
bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been swapped out
to disk (in bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MAJFLT The number of major page faults that have occurred.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MINFLT The number of minor page faults that have occurred.
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used for any purpose
(in bytes).
\end_layout
\begin_layout Description
VIRTIO_BALLOON_S_MEMTOT The total amount of memory available (in bytes).
\end_layout
\end_body
\end_document