#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass report
\use_default_options false
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation skip
\defskip medskip
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes true
\output_changes true
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\author -608949062 "Rusty Russell,,,"
\author 1531152142 "pbonzini"
\end_header
\begin_body
\begin_layout Title
Virtio PCI Card Specification
\begin_inset Newline newline
\end_inset
v0.9.
\change_inserted -608949062 1328582029
4
\change_deleted -608949062 1328582028
3
\change_unchanged
DRAFT
\begin_inset Newline newline
\end_inset
-
\end_layout
\begin_layout Author
Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
\end_layout
\begin_layout Date
201
\change_inserted -608949062 1328582035
2
\change_deleted -608949062 1328582047
1 December 6
\change_inserted -608949062 1328582050
February 7
\change_unchanged
.
\end_layout
\begin_layout Chapter
Purpose and Description
\end_layout
\begin_layout Standard
This document describes the specifications of the
\begin_inset Quotes eld
\end_inset
virtio
\begin_inset Quotes erd
\end_inset
family of
\emph on
PCI
\emph default
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "PCI"
description "Peripheral Component Interconnect; a common device bus. See\\\\http://en.wikipedia.org/wiki/Peripheral Component Interconnect"
\end_inset
devices.
These devices are found in
\emph on
virtual
\emph default
\emph on
environments
\begin_inset CommandInset nomenclature
LatexCommand nomenclature
symbol "virtualized"
description "Environments where access to hardware is restricted (and often emulated) by a hypervisor."
\end_inset
\emph default
, yet by design they are not all that different from physical PCI devices,
and this document treats them as such.
This allows the guest to use standard PCI drivers and discovery mechanisms.
\end_layout
\begin_layout Standard
The purpose of virtio and this specification is that virtual environments
and guests should have a straightforward, efficient, standard and extensible
mechanism for virtual devices, rather than boutique per-environment or
per-OS mechanisms.
\end_layout
\begin_layout Description
Straightforward: Virtio PCI devices use normal PCI mechanisms of interrupts
and DMA which should be familiar to any device driver author.
There is no exotic page-flipping or COW mechanism: it's just a PCI device.
\begin_inset Foot
status open
\begin_layout Plain Layout
This lack of page-sharing implies that the implementation of the device
(e.g.
the hypervisor or host) needs full access to the guest memory.
Communication with untrusted parties (i.e.
inter-guest communication) requires copying.
\end_layout
\end_inset
\end_layout
\begin_layout Description
Efficient: Virtio PCI devices consist of rings of descriptors for input
and output, which are neatly separated to avoid cache effects from both
guest and device writing to the same cache lines.
\end_layout
\begin_layout Description
Standard: Virtio PCI makes no assumptions about the environment in which
it operates, beyond supporting PCI.
In fact the virtio devices specified in the appendices do not require PCI
at all: they have been implemented on non-PCI buses.
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux implementation further separates the PCI virtio code from the
specific virtio drivers: these drivers are shared with the non-PCI implementati
ons (currently lguest and S/390).
\end_layout
\end_inset
\end_layout
\begin_layout Description
Extensible: Virtio PCI devices contain feature bits which are acknowledged
by the guest operating system during device setup.
This allows forwards and backwards compatibility: the device offers all
the features it knows about, and the driver acknowledges those it understands
and wishes to use.
\end_layout
\begin_layout Section
Virtqueues
\end_layout
\begin_layout Standard
The mechanism for bulk data transport on virtio PCI devices is pretentiously
called a virtqueue.
Each device can have zero or more virtqueues: for example, the network
device has one for transmit and one for receive.
\end_layout
\begin_layout Standard
Each virtqueue occupies two or more physically-contiguous pages (defined,
for the purposes of this specification, as 4096 bytes), and consists of
three parts:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="1" columns="4">
<features tabularvalignment="middle">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Descriptor Table
\end_layout
\end_inset
</cell>
<cell multicolumn="1" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Available Ring
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\begin_inset space ~
\end_inset
\emph on
(padding)
\end_layout
\end_inset
</cell>
<cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Used Ring
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
When the driver wants to send a buffer to the device, it fills in a slot
in the descriptor table (or chains several together), and writes the descriptor
index into the available ring.
It then notifies the device.
When the device has finished a buffer, it writes the descriptor into the
used ring, and sends an interrupt.
\end_layout
\begin_layout Chapter
Specification
\end_layout
\begin_layout Section
PCI Discovery
\end_layout
\begin_layout Standard
Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 through 0x103F
inclusive is a virtio device
\begin_inset Foot
status open
\begin_layout Plain Layout
The actual value within this range is ignored
\end_layout
\end_inset
.
The device must also have a Revision ID of 0 to match this specification.
\end_layout
\begin_layout Standard
The Subsystem Device ID indicates which virtio device is supported by the
device.
The Subsystem Vendor ID should reflect the PCI Vendor ID of the environment
(it's currently only used for informational purposes by the guest).
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="9" columns="3">
<features tabularvalignment="middle">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="bottom" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Subsystem Device ID
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Virtio Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Specification
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
1
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
network card
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix C
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
2
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
block device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix D
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
3
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
console
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix E
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
4
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
entropy source
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix F
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
5
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
memory ballooning
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Appendix G
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
6
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
ioMemory
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
-
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted -608949062 1323409038
7
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted -608949062 1323409050
rpmsg
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted -608949062 1323409055
-
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted 1531152142 1328438958
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted 1531152142 1322650855
SCSI host
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\change_inserted 1531152142 1322650861
Appendix H
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Section
Device Configuration
\end_layout
\begin_layout Standard
To configure the device, we use the first I/O region of the PCI device.
This contains a
\emph on
virtio header
\emph default
followed by a
\emph on
device-specific region.
\end_layout
\begin_layout Standard
There may be different widths of accesses to the I/O region; the
\begin_inset Quotes eld
\end_inset
natural
\begin_inset Quotes erd
\end_inset
access method for each field in the virtio header must be used (i.e.
32-bit accesses for 32-bit fields, etc.), but the device-specific region
can be accessed using any width accesses, and should obtain the same results.
\end_layout
\begin_layout Standard
Note that this is possible because while the virtio header is PCI (i.e.
little) endian, the device-specific region is encoded in the native endian
of the guest (where such distinction is applicable).
\end_layout
\begin_layout Subsection
Device Initialization Sequence
\begin_inset CommandInset label
LatexCommand label
name "sub:Device-Initialization-Sequence"
\end_inset
\end_layout
\begin_layout Standard
We start with an overview of device initialization, then expand on the details
of the device and how each step is performed.
\end_layout
\begin_layout Enumerate
Reset the device.
This is not required on initial start up.
\end_layout
\begin_layout Enumerate
The ACKNOWLEDGE status bit is set: we have noticed the device.
\end_layout
\begin_layout Enumerate
The DRIVER status bit is set: we know how to drive the device.
\end_layout
\begin_layout Enumerate
Device-specific setup, including reading the Device Feature Bits, discovery
of virtqueues for the device, optional MSI-X setup, and reading and possibly
writing the virtio configuration space.
\end_layout
\begin_layout Enumerate
The subset of Device Feature Bits understood by the driver is written to
the device.
\end_layout
\begin_layout Enumerate
The DRIVER_OK status bit is set.
\end_layout
\begin_layout Enumerate
The device can now be used (i.e.
buffers added to the virtqueues)
\begin_inset Foot
status open
\begin_layout Plain Layout
Historically, drivers have used the device before steps 5 and 6.
This is only allowed if the driver does not use any features which would
alter this early use of the device.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
If any of these steps go irrecoverably wrong, the guest should set the FAILED
status bit to indicate that it has given up on the device (it can reset
the device later to restart if desired).
\end_layout
\begin_layout Standard
We now cover the fields required for general setup in detail.
\end_layout
\begin_layout Subsection
Virtio Header
\end_layout
\begin_layout Standard
The virtio header looks as follows:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="9">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
32
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
8
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Guest
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
ISR
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features bits 0:31
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Features bits 0:31
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Address
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Size
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Select
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Notify
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Status
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
If MSI-X is enabled for the device, two additional fields immediately follow
this header:
\begin_inset Foot
status collapsed
\begin_layout Plain Layout
i.e.
once you enable MSI-X on the device, the other fields move.
If you turn it off again, they move back!
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="3">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
16
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
R+W
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Configuration
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Queue
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
(MSI-X)
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Vector
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Standard
Immediately following these general headers, there may be device-specific
headers:
\end_layout
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="4" columns="2">
<features tabularvalignment="middle">
<column alignment="left" valignment="top" width="0">
<column alignment="left" valignment="top" width="0">
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Bits
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Read/Write
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Device Specific
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
Purpose
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\size footnotesize
Device Specific...
\end_layout
\end_inset
</cell>
</row>
<row>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
\begin_inset Text
\begin_layout Plain Layout
\end_layout
\end_inset
</cell>
</row>
</lyxtabular>
\end_inset
\end_layout
\begin_layout Subsubsection
Device Status
\end_layout
\begin_layout Standard
The Device Status field is updated by the guest to indicate its progress.
This provides a simple low-level diagnostic: it's most useful to imagine
them hooked up to traffic lights on the console indicating the status of
each device.
\end_layout
\begin_layout Standard
The device can be reset by writing a 0 to this field, otherwise at least
one bit should be set:
\end_layout
\begin_layout Description
ACKNOWLEDGE
\begin_inset space ~
\end_inset
(1) Indicates that the guest OS has found the device and recognized it as
a valid virtio device.
\end_layout
\begin_layout Description
DRIVER
\begin_inset space ~
\end_inset
(2) Indicates that the guest OS knows how to drive the device.
Under Linux, drivers can be loadable modules so there may be a significant
(or infinite) delay before setting this bit.
\end_layout
\begin_layout Description
DRIVER_OK
\begin_inset space ~
\end_inset
(4) Indicates that the driver is set up and ready to drive the device.
\end_layout
\begin_layout Description
FAILED
\begin_inset space ~
\end_inset
(128) Indicates that something went wrong in the guest, and it has given
up on the device.
This could be an internal error, or the driver didn't like the device for
some reason, or even a fatal error during device operation.
The device must be reset before attempting to re-initialize.
\end_layout
\begin_layout Subsubsection
Feature Bits
\begin_inset CommandInset label
LatexCommand label
name "sub:Feature-Bits"
\end_inset
\end_layout
\begin_layout Standard
The first configuration field indicates the features that the device supports.
The bits are allocated as follows:
\end_layout
\begin_layout Description
0
\begin_inset space ~
\end_inset
to
\begin_inset space ~
\end_inset
23 Feature bits for the specific device type
\end_layout
\begin_layout Description
24
\begin_inset space \space{}
\end_inset
to
\begin_inset space ~
\end_inset
31 Feature bits reserved for extensions to the queue and feature negotiation
mechanisms
\end_layout
\begin_layout Standard
For example, feature bit 0 for a network device (i.e.
Subsystem Device ID 1) indicates that the device supports checksumming
of packets.
\end_layout
\begin_layout Standard
The feature bits are
\emph on
negotiated:
\emph default
the device lists all the features it understands in the Device Features
field, and the guest writes the subset that it understands into the Guest
Features field.
The only way to renegotiate is to reset the device.
\end_layout
\begin_layout Standard
In particular, new fields in the device configuration header are indicated
by offering a feature bit, so the guest can check before accessing that
part of the configuration space.
\end_layout
\begin_layout Standard
This allows for forwards and backwards compatibility: if the device is enhanced
with a new feature bit, older guests will not write that feature bit back
to the Guest Features field and it can go into backwards compatibility
mode.
Similarly, if a guest is enhanced with a feature that the device doesn't
support, it will not see that feature bit in the Device Features field
and can go into backwards compatibility mode (or, for poor implementations,
set the FAILED Device Status bit).
\end_layout
\begin_layout Subsubsection
Configuration/Queue Vectors
\end_layout
\begin_layout Standard
When MSI-X capability is present and enabled in the device (through standard
PCI configuration space) 4 bytes at byte offset 20 are used to map configuratio
n change and queue interrupts to MSI-X vectors.
In this case, the ISR Status field is unused, and device specific configuration
starts at byte offset 24 in virtio header structure.
When MSI-X capability is not enabled, device specific configuration starts
at byte offset 20 in virtio header.
\end_layout
\begin_layout Standard
Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of Configuration/Qu
eue Vector registers,
\emph on
maps
\emph default
interrupts triggered by the configuration change/selected queue events
respectively to the corresponding MSI-X vector.
To disable interrupts for a specific event type, unmap it by writing a
special NO_VECTOR value:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* Vector value used to disable MSI for queue */
\end_layout
\begin_layout Plain Layout
#define VIRTIO_MSI_NO_VECTOR 0xffff
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Reading these registers returns vector mapped to a given event, or NO_VECTOR
if unmapped.
All queue and configuration change events are unmapped by default.
\end_layout
\begin_layout Standard
Note that mapping an event to vector might require allocating internal device
resources, and might fail.
Devices report such failures by returning the NO_VECTOR value when the
relevant Vector field is read.
After mapping an event to vector, the driver must verify success by reading
the Vector field value: on success, the previously written value is returned,
and on failure, NO_VECTOR is returned.
If a mapping failure is detected, the driver can retry mapping with fewer vector
s, or disable MSI-X.
\end_layout
\begin_layout Section
Virtqueue Configuration
\begin_inset CommandInset label
LatexCommand label
name "sec:Virtqueue-Configuration"
\end_inset
\end_layout
\begin_layout Standard
As a device can have zero or more virtqueues for bulk data transport (for
example, the network driver has two), the driver needs to configure them
as part of the device-specific configuration.
\end_layout
\begin_layout Standard
This is done as follows, for each virtqueue a device has:
\end_layout
\begin_layout Enumerate
Write the virtqueue index (first queue is 0) to the Queue Select field.
\end_layout
\begin_layout Enumerate
Read the virtqueue size from the Queue Size field, which is always a power
of 2.
This controls how big the virtqueue is (see below).
If this field is 0, the virtqueue does not exist.
\end_layout
\begin_layout Enumerate
Allocate and zero virtqueue in contiguous physical memory, on a 4096 byte
alignment.
Write the physical address, divided by 4096, to the Queue Address field.
\begin_inset Foot
status open
\begin_layout Plain Layout
The 4096 is based on the x86 page size, but it's also large enough to ensure
that the separate parts of the virtqueue are on separate cache lines.
\end_layout
\end_inset
\end_layout
\begin_layout Enumerate
Optionally, if MSI-X capability is present and enabled on the device, select
a vector to use to request interrupts triggered by virtqueue events.
Write the MSI-X Table entry number corresponding to this vector in Queue
Vector field.
Read the Queue Vector field: on success, previously written value is returned;
on failure, NO_VECTOR value is returned.
\end_layout
\begin_layout Standard
The Queue Size field controls the total number of bytes required for the
virtqueue according to the following formula:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#define ALIGN(x) (((x) + 4095) & ~4095)
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int qsz)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(3 + qsz))
\end_layout
\begin_layout Plain Layout
+ ALIGN(sizeof(u16)*3 + sizeof(struct vring_used_elem)*qsz);
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Standard
This currently wastes some space with padding, but also allows future extensions.
The virtqueue layout structure looks like this (qsz is the Queue Size field,
which is a variable, so this code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[qsz];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* A ring of available descriptor heads with free-running index.
*/
\end_layout
\begin_layout Plain Layout
struct vring_avail avail;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// Padding to the next 4096 boundary.
\end_layout
\begin_layout Plain Layout
char pad[];
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
// A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
struct vring_used used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
A Note on Virtqueue Endianness
\end_layout
\begin_layout Standard
Note that the
\emph on
endian
\emph default
of these fields and everything else in the virtqueue is the native endian
of the guest, not little-endian as PCI normally is.
This makes for simpler guest code, and it is assumed that the host already
has to be deeply aware of the guest endian so such an
\begin_inset Quotes eld
\end_inset
endian-aware
\begin_inset Quotes erd
\end_inset
device is not a significant issue.
\end_layout
\begin_layout Subsection
Descriptor Table
\end_layout
\begin_layout Standard
The descriptor table refers to the buffers the guest is using for the device.
The addresses are physical addresses, and the buffers can be chained via
the next field.
Each descriptor describes a buffer which is read-only or write-only, but
a chain of descriptors can contain both read-only and write-only buffers.
\end_layout
\begin_layout Standard
No descriptor chain may be more than 2^32 bytes long in total.
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
u64 addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
/* This means the buffer contains a list of buffer descriptors.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_INDIRECT 4
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
/* Next field if flags & NEXT */
\end_layout
\begin_layout Plain Layout
u16 next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The number of descriptors in the table is specified by the Queue Size field
for this virtqueue.
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Indirect-Descriptors"
\end_inset
Indirect Descriptors
\end_layout
\begin_layout Standard
Some devices benefit by concurrently dispatching a large number of large
requests.
The VIRTIO_RING_F_INDIRECT_DESC feature can be used to allow this (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
To increase ring capacity it is possible to store a table of
\emph on
indirect descriptors
\emph default
anywhere in memory, and insert a descriptor in main virtqueue (with flags&INDIR
ECT on) that refers to memory buffer containing this
\emph on
indirect descriptor table
\emph default
; fields
\emph on
addr
\emph default
and
\emph on
len
\emph default
refer to the indirect table address and length in bytes, respectively.
The indirect table layout structure looks like this (len is the length
of the descriptor that refers to this table, which is a variable, so this
code won't compile):
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct indirect_descriptor_table {
\end_layout
\begin_layout Plain Layout
/* The actual descriptors (16 bytes each) */
\end_layout
\begin_layout Plain Layout
struct vring_desc desc[len / 16];
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The first indirect descriptor is located at start of the indirect descriptor
table (index 0), additional indirect descriptors are chained by next field.
An indirect descriptor without next field (with flags&NEXT off) signals
the end of the indirect descriptor table, and transfers control back to
the main virtqueue.
An indirect descriptor can not refer to another indirect descriptor table
(flags&INDIRECT must be off).
A single indirect descriptor table can include both read-only and write-only
descriptors; write-only flag (flags&WRITE) in the descriptor that refers
to it is ignored.
\end_layout
\begin_layout Subsection
Available Ring
\end_layout
\begin_layout Standard
The available ring refers to what descriptors we are offering the device:
it refers to the head of a descriptor chain.
The
\begin_inset Quotes eld
\end_inset
flags
\begin_inset Quotes erd
\end_inset
field is currently 0 or 1: 1 indicating that we do not need an interrupt
when the device consumes a descriptor from the available ring.
Alternatively, the guest can ask the device to delay interrupts until an
entry with an index specified by the
\begin_inset Quotes eld
\end_inset
used_event
\begin_inset Quotes erd
\end_inset
field is written in the used ring (equivalently, until the
\emph on
idx
\emph default
field in the used ring will reach the value
\emph on
used_event + 1
\emph default
).
The method employed by the device is controlled by the VIRTIO_RING_F_EVENT_IDX
feature bit (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
This interrupt suppression is merely an optimization; it may not suppress
interrupts entirely.
\end_layout
\begin_layout Standard
The
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field indicates where we would put the
\emph on
next
\emph default
descriptor entry (modulo the ring size).
This starts at 0, and increases.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
u16 ring[qsz]; /* qsz is the Queue Size field read from device */
\end_layout
\begin_layout Plain Layout
u16 used_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Used Ring
\end_layout
\begin_layout Standard
The used ring is where the device returns buffers once it is done with them.
The flags field can be used by the device to hint that no notification
is necessary when the guest adds to the
\emph on
available
\emph default
ring.
Alternatively, the
\begin_inset Quotes eld
\end_inset
avail_event
\begin_inset Quotes erd
\end_inset
field can be used by the device to hint that no notification is necessary
until an entry with an index specified by the
\begin_inset Quotes eld
\end_inset
avail_event
\begin_inset Quotes erd
\end_inset
is written in the available ring (equivalently, until the
\emph on
idx
\emph default
field in the available ring will reach the value
\emph on
avail_event + 1
\emph default
).
The method employed by the device is controlled by the guest through the
VIRTIO_RING_F_EVENT_IDX feature bit (see
\begin_inset CommandInset ref
LatexCommand ref
reference "cha:Reserved-Feature-Bits"
\end_inset
).
\begin_inset Foot
status open
\begin_layout Plain Layout
These fields are kept here because this is the only part of the virtqueue
written by the device
\end_layout
\end_inset
.
\end_layout
\begin_layout Standard
Each entry in the ring is a pair: the head entry of the descriptor chain
describing the buffer (this matches an entry placed in the available ring
by the guest earlier), and the total number of bytes written into the buffer.
The latter is extremely useful for guests using untrusted buffers: if you
do not know exactly how much has been written by the device, you usually
have to zero the buffer to ensure no data leakage occurs.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
u32 id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was used (written to)
*/
\end_layout
\begin_layout Plain Layout
u32 len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
u16 flags;
\end_layout
\begin_layout Plain Layout
u16 idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[qsz];
\end_layout
\begin_layout Plain Layout
u16 avail_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Helpers for Managing Virtqueues
\end_layout
\begin_layout Standard
The Linux Kernel Source code contains the definitions above and helper routines
in a more usable form, in include/linux/virtio_ring.h.
This was explicitly licensed by IBM and Red Hat under the (3-clause) BSD
license so that it can be freely used by all other projects, and is reproduced
(with slight variation to remove Linux assumptions) in Appendix A.
\end_layout
\begin_layout Section
Device Operation
\begin_inset CommandInset label
LatexCommand label
name "sec:Device-Operation"
\end_inset
\end_layout
\begin_layout Standard
There are two parts to device operation: supplying new buffers to the device,
and processing used buffers from the device.
As an example, the virtio network device has two virtqueues: the transmit
virtqueue and the receive virtqueue.
The driver adds outgoing (read-only) packets to the transmit virtqueue,
and then frees them after they are used.
Similarly, incoming (write-only) buffers are added to the receive virtqueue,
and processed after they are used.
\end_layout
\begin_layout Subsection
Supplying Buffers to The Device
\end_layout
\begin_layout Standard
Actual transfer of buffers from the guest OS to the device operates as follows:
\end_layout
\begin_layout Enumerate
Place the buffer(s) into free descriptor(s).
\end_layout
\begin_deeper
\begin_layout Enumerate
If there are no free descriptors, the guest may choose to notify the device
even if notifications are suppressed (to reduce latency).
\begin_inset Foot
status open
\begin_layout Plain Layout
The Linux drivers do this only for read-only buffers: for write-only buffers,
it is assumed that the driver is merely trying to keep the receive buffer
ring full, and no notification of this expected condition is necessary.
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
Place the id of the buffer in the next ring entry of the available ring.
\end_layout
\begin_layout Enumerate
The steps (1) and (2) may be performed repeatedly if batching is possible.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure the device sees the updated
descriptor table and available ring before the next step.
\end_layout
\begin_layout Enumerate
The available
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field should be increased by the number of entries added to the available
ring.
\end_layout
\begin_layout Enumerate
A memory barrier should be executed to ensure that we update the idx field
before checking for notification suppression.
\end_layout
\begin_layout Enumerate
If notifications are not suppressed, the device should be notified of the
new buffers.
\end_layout
\begin_layout Standard
Note that the above code does not take precautions against the available
ring buffer wrapping around: this is not possible since the ring buffer
is the same size as the descriptor table, so step (1) will prevent such
a condition.
\end_layout
\begin_layout Standard
In addition, the maximum queue size is 32768 (it must be a power of 2 which
fits in 16 bits), so the 16-bit
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
value can always distinguish between a full and empty buffer.
\end_layout
\begin_layout Standard
Here is a description of each stage in more detail.
\end_layout
\begin_layout Subsubsection
Placing Buffers Into The Descriptor Table
\end_layout
\begin_layout Standard
A buffer consists of zero or more read-only physically-contiguous elements
followed by zero or more physically-contiguous write-only elements (it
must have at least one element).
This algorithm maps it into the descriptor table:
\end_layout
\begin_layout Enumerate
for each buffer element,
\family typewriter
b
\family default
:
\end_layout
\begin_deeper
\begin_layout Enumerate
Get the next free descriptor table entry,
\family typewriter
d
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.addr
\family default
to the physical address of the start of
\family typewriter
b
\end_layout
\begin_layout Enumerate
Set
\family typewriter
d.len
\family default
to the length of
\family typewriter
b
\family default
.
\end_layout
\begin_layout Enumerate
If
\family typewriter
b
\family default
is write-only, set
\family typewriter
d.flags
\family default
to VRING_DESC_F_WRITE, otherwise 0.
\end_layout
\begin_layout Enumerate
If there is a buffer element after this:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set
\family typewriter
d.next
\family default
to the index of the next free descriptor element.
\end_layout
\begin_layout Enumerate
Set the VRING_DESC_F_NEXT bit in
\family typewriter
d.flags
\family default
.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
In practice, the d.next fields are usually used to chain free descriptors,
and a separate count kept to check there are enough free descriptors before
beginning the mappings.
\end_layout
\begin_layout Subsubsection
Updating The Available Ring
\end_layout
\begin_layout Standard
The head of the buffer we mapped is the first
\family typewriter
d
\family default
in the algorithm above.
A naive implementation would do the following:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[avail->idx % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Standard
However, in general we can add many descriptors before we update the
\begin_inset Quotes eld
\end_inset
idx
\begin_inset Quotes erd
\end_inset
field (at which point they become visible to the device), so we keep a
counter of how many we've added:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->ring[(avail->idx + added++) % qsz] = head;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
Updating The Index Field
\end_layout
\begin_layout Standard
Once the idx field of the virtqueue is updated, the device will be able
to access the descriptor entries we've created and the memory they refer
to.
This is why a memory barrier is generally used before the idx update, to
ensure it sees the most up-to-date copy.
\end_layout
\begin_layout Standard
The idx field always increments, and we let it wrap naturally at 65536:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
avail->idx += added;
\end_layout
\end_inset
\end_layout
\begin_layout Subsubsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Notifying-The-Device"
\end_inset
Notifying The Device
\end_layout
\begin_layout Standard
Device notification occurs by writing the 16-bit virtqueue index of this
virtqueue to the Queue Notify field of the virtio header in the first I/O
region of the PCI device.
This can be expensive, however, so the device can suppress such notifications
if it doesn't need them.
We have to be careful to expose the new idx value
\emph on
before
\emph default
checking the suppression flag: it's OK to notify gratuitously, but not
to omit a required notification.
So again, we use a memory barrier here before reading the flags or the
avail_event field.
\end_layout
\begin_layout Standard
If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated, and if the VRING_USED_
F_NO_NOTIFY flag is not set, we go ahead and write to the PCI configuration
space.
\end_layout
\begin_layout Standard
If the VIRTIO_RING_F_EVENT_IDX feature is negotiated, we read the avail_event
field in the used ring structure.
If the available index crossed the
\emph on
avail_event
\emph default
field value since the last notification, we go ahead and write to the PCI
configuration space.
The
\emph on
avail_event
\emph default
field wraps naturally at 65536 as well:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:Receiving-Used-Buffers"
\end_inset
Receiving Used Buffers From The Device
\end_layout
\begin_layout Standard
Once the device has used a buffer (read from or written to it, or parts
of both, depending on the nature of the virtqueue and the device), it sends
an interrupt, following an algorithm very similar to the algorithm used
for the driver to send the device a buffer:
\end_layout
\begin_layout Enumerate
Write the head descriptor number to the next field in the used ring.
\end_layout
\begin_layout Enumerate
Update the used ring idx.
\end_layout
\begin_layout Enumerate
Determine whether an interrupt is necessary:
\end_layout
\begin_deeper
\begin_layout Enumerate
If the VIRTIO_RING_F_EVENT_IDX feature is not negotiated: check if the
VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail\SpecialChar \nobreakdash-
>flags
\end_layout
\begin_layout Enumerate
If the VIRTIO_RING_F_EVENT_IDX feature is negotiated: check whether the
used index crossed the
\emph on
used_event
\emph default
field value since the last update.
The
\emph on
used_event
\emph default
field wraps naturally at 65536 as well:
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
\end_layout
\end_inset
\end_layout
\end_deeper
\begin_layout Enumerate
If an interrupt is necessary:
\end_layout
\begin_deeper
\begin_layout Enumerate
If MSI-X capability is disabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Set the lower bit of the ISR Status field for the device.
\end_layout
\begin_layout Enumerate
Send the appropriate PCI interrupt for the device.
\end_layout
\end_deeper
\begin_layout Enumerate
If MSI-X capability is enabled:
\end_layout
\begin_deeper
\begin_layout Enumerate
Request the appropriate MSI-X interrupt message for the device; the Queue
Vector field sets the MSI-X Table entry number.
\end_layout
\begin_layout Enumerate
If Queue Vector field value is NO_VECTOR, no interrupt message is requested
for this event.
\end_layout
\end_deeper
\end_deeper
\begin_layout Standard
The guest interrupt handler should:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: read the ISR Status field, which will reset
it to zero.
If the lower bit is zero, the interrupt was not for this device.
Otherwise, the guest driver should look through the used rings of each
virtqueue for the device, to see if any progress has been made by the device
which requires servicing.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: look through the used rings of each virtqueue
mapped to the specific MSI-X vector for the device, to see if any progress
has been made by the device which requires servicing.
\end_layout
\begin_layout Standard
For each ring, the guest should then disable interrupts by setting the
VRING_AVAIL_F_NO_INTERRUPT flag in the avail structure, if required.
It can then process used ring entries, finally enabling interrupts by clearing
the VRING_AVAIL_F_NO_INTERRUPT flag or updating the used_event field in
the available structure.
The guest should then execute a memory barrier, and then recheck the ring
empty condition.
This is necessary to handle the case where, after the last check and before
enabling interrupts, an interrupt has been suppressed by the device:
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
vring_disable_interrupts(vq);
\end_layout
\begin_layout Plain Layout
for (;;) {
\end_layout
\begin_layout Plain Layout
if (vq->last_seen_used == vring->used.idx) {
\end_layout
\begin_layout Plain Layout
vring_enable_interrupts(vq);
\end_layout
\begin_layout Plain Layout
mb();
\end_layout
\begin_layout Plain Layout
if (vq->last_seen_used == vring->used.idx)
\end_layout
\begin_layout Plain Layout
break;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
struct vring_used_elem *e = vring.used->ring[vq->last_seen_used%vsz];
\end_layout
\begin_layout Plain Layout
process_buffer(e);
\end_layout
\begin_layout Plain Layout
vq->last_seen_used++;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\end_inset
\end_layout
\begin_layout Subsection
Dealing With Configuration Changes
\begin_inset CommandInset label
LatexCommand label
name "sub:Dealing-With-Configuration"
\end_inset
\end_layout
\begin_layout Standard
Some virtio PCI devices can change the device configuration state, as reflected
in the virtio header in the PCI configuration space.
In this case:
\end_layout
\begin_layout Enumerate
If MSI-X capability is disabled: an interrupt is delivered and the second
lowest bit is set in the ISR Status field to indicate that the driver
should re-examine the configuration space.
Note that a single interrupt can indicate both that one or more virtqueues
have been used and that the configuration space has changed: even if the
config bit is set, virtqueues must be scanned.
\end_layout
\begin_layout Enumerate
If MSI-X capability is enabled: an interrupt message is requested.
The Configuration Vector field sets the MSI-X Table entry number to use.
If Configuration Vector field value is NO_VECTOR, no interrupt message
is requested for this event.
\end_layout
\begin_layout Chapter
Creating New Device Types
\end_layout
\begin_layout Standard
Various considerations are necessary when creating a new device type:
\end_layout
\begin_layout Section*
How Many Virtqueues?
\end_layout
\begin_layout Standard
It is possible that a very simple device will operate entirely through its
configuration space, but most will need at least one virtqueue in which
it will place requests.
A device with both input and output (e.g.
console and network devices described here) needs two queues: one which
the driver fills with buffers to receive input, and one in which the driver
places buffers to transmit output.
\end_layout
\begin_layout Section*
What Configuration Space Layout?
\end_layout
\begin_layout Standard
Configuration space is generally used for rarely-changing or initialization-time
parameters.
But it is a limited resource, so it might be better to use a virtqueue
to update configuration information (the network device does this for filtering
, otherwise the table in the config space could potentially be very large).
\end_layout
\begin_layout Standard
Note that this space is generally the guest's native endian, rather than
PCI's little-endian.
\end_layout
\begin_layout Section*
What Device Number?
\end_layout
\begin_layout Standard
Currently device numbers are assigned quite freely: a simple request mail
to the author of this document or the Linux virtualization mailing list
\begin_inset Foot
status open
\begin_layout Plain Layout
https://lists.linux-foundation.org/mailman/listinfo/virtualization
\end_layout
\end_inset
will be sufficient to secure a unique one.
\end_layout
\begin_layout Standard
Meanwhile for experimental drivers, use 65535 and work backwards.
\end_layout
\begin_layout Section*
How many MSI-X vectors?
\end_layout
\begin_layout Standard
Using the optional MSI-X capability devices can speed up interrupt processing
by removing the need to read ISR Status register by guest driver (which
might be an expensive operation), reducing interrupt sharing between devices
and queues within the device, and handling interrupts from multiple CPUs.
However, some systems impose a limit (which might be as low as 256) on
the total number of MSI-X vectors that can be allocated to all devices.
Devices and/or device drivers should take this into account, limiting the
number of vectors used unless the device is expected to cause a high volume
of interrupts.
Devices can control the number of vectors used by limiting the MSI-X Table
Size or not presenting MSI-X capability in PCI configuration space.
Drivers can control this by mapping events to as small a number of vectors
as possible, or disabling MSI-X capability altogether.
\end_layout
\begin_layout Section*
Message Framing
\end_layout
\begin_layout Standard
The descriptors used for a buffer should not affect the semantics of the
message, except for the total length of the buffer.
For example, a network buffer consists of a 10 byte header followed by
the network packet.
Whether this is presented in the ring descriptor chain as (say) a 10 byte
buffer and a 1514 byte buffer, or a single 1524 byte buffer, or even three
buffers, should have no effect.
\end_layout
\begin_layout Standard
In particular, no implementation should use the descriptor boundaries to
determine the size of any header in a request.
\begin_inset Foot
status open
\begin_layout Plain Layout
The current qemu device implementations mistakenly insist that the first
descriptor cover the header in these cases exactly, so a cautious driver
should arrange it so.
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Device Improvements
\end_layout
\begin_layout Standard
Any change to configuration space, or new virtqueues, or behavioural changes,
should be indicated by negotiation of a new feature bit.
This establishes clarity
\begin_inset Foot
status open
\begin_layout Plain Layout
Even if it does mean documenting design or implementation mistakes!
\end_layout
\end_inset
and avoids future expansion problems.
\end_layout
\begin_layout Standard
Clusters of functionality which are always implemented together can use
a single bit, but if one feature makes sense without the others they should
not be gratuitously grouped together to conserve feature bits.
We can always extend the spec when the first person needs more than 24
feature bits for their device.
\end_layout
\begin_layout Standard
\begin_inset CommandInset nomencl_print
LatexCommand printnomenclature
set_width "none"
\end_inset
\end_layout
\begin_layout Chapter*
Appendix A: virtio_ring.h
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
#ifndef VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
#define VIRTIO_RING_H
\end_layout
\begin_layout Plain Layout
/* An interface for efficient virtio implementation.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* This header is BSD licensed so anyone can use the definitions
\end_layout
\begin_layout Plain Layout
* to implement compatible drivers/servers.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Copyright 2007, 2009, IBM Corporation
\end_layout
\begin_layout Plain Layout
* Copyright 2011, Red Hat, Inc
\end_layout
\begin_layout Plain Layout
* All rights reserved.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* Redistribution and use in source and binary forms, with or without
\end_layout
\begin_layout Plain Layout
* modification, are permitted provided that the following conditions
\end_layout
\begin_layout Plain Layout
* are met:
\end_layout
\begin_layout Plain Layout
* 1.
Redistributions of source code must retain the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer.
\end_layout
\begin_layout Plain Layout
* 2.
Redistributions in binary form must reproduce the above copyright
\end_layout
\begin_layout Plain Layout
* notice, this list of conditions and the following disclaimer in the
\end_layout
\begin_layout Plain Layout
* documentation and/or other materials provided with the distribution.
\end_layout
\begin_layout Plain Layout
* 3.
Neither the name of IBM nor the names of its contributors
\end_layout
\begin_layout Plain Layout
* may be used to endorse or promote products derived from this software
\end_layout
\begin_layout Plain Layout
* without specific prior written permission.
\end_layout
\begin_layout Plain Layout
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
IS'' AND
\end_layout
\begin_layout Plain Layout
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
\end_layout
\begin_layout Plain Layout
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
\end_layout
\begin_layout Plain Layout
* ARE DISCLAIMED.
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
\end_layout
\begin_layout Plain Layout
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
\end_layout
\begin_layout Plain Layout
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
\end_layout
\begin_layout Plain Layout
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
\end_layout
\begin_layout Plain Layout
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
\end_layout
\begin_layout Plain Layout
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY
\end_layout
\begin_layout Plain Layout
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
\end_layout
\begin_layout Plain Layout
* SUCH DAMAGE.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as continuing via the next field.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_NEXT 1
\end_layout
\begin_layout Plain Layout
/* This marks a buffer as write-only (otherwise read-only).
*/
\end_layout
\begin_layout Plain Layout
#define VRING_DESC_F_WRITE 2
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The Host uses this in used->flags to advise the Guest: don't kick me
\end_layout
\begin_layout Plain Layout
* when you add a buffer.
It's unreliable, so it's simply an
\end_layout
\begin_layout Plain Layout
* optimization.
Guest will still kick if it's out of buffers.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_USED_F_NO_NOTIFY 1
\end_layout
\begin_layout Plain Layout
/* The Guest uses this in avail->flags to advise the Host: don't
\end_layout
\begin_layout Plain Layout
* interrupt me when you consume a buffer.
It's unreliable, so it's
\end_layout
\begin_layout Plain Layout
* simply an optimization.
*/
\end_layout
\begin_layout Plain Layout
#define VRING_AVAIL_F_NO_INTERRUPT 1
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* Virtio ring descriptors: 16 bytes.
\end_layout
\begin_layout Plain Layout
* These can chain together via "next".
*/
\end_layout
\begin_layout Plain Layout
struct vring_desc {
\end_layout
\begin_layout Plain Layout
/* Address (guest-physical).
*/
\end_layout
\begin_layout Plain Layout
uint64_t addr;
\end_layout
\begin_layout Plain Layout
/* Length.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
/* The flags as indicated above.
*/
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
/* We chain unused descriptors via this, too */
\end_layout
\begin_layout Plain Layout
uint16_t next;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_avail {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
uint16_t ring[];
\end_layout
\begin_layout Plain Layout
uint16_t used_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* u32 is used here for ids for padding reasons.
*/
\end_layout
\begin_layout Plain Layout
struct vring_used_elem {
\end_layout
\begin_layout Plain Layout
/* Index of start of used descriptor chain.
*/
\end_layout
\begin_layout Plain Layout
uint32_t id;
\end_layout
\begin_layout Plain Layout
/* Total length of the descriptor chain which was written to.
*/
\end_layout
\begin_layout Plain Layout
uint32_t len;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_used {
\end_layout
\begin_layout Plain Layout
uint16_t flags;
\end_layout
\begin_layout Plain Layout
uint16_t idx;
\end_layout
\begin_layout Plain Layout
struct vring_used_elem ring[];
\end_layout
\begin_layout Plain Layout
uint16_t avail_event;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring {
\end_layout
\begin_layout Plain Layout
unsigned int num;
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
struct vring_desc *desc;
\end_layout
\begin_layout Plain Layout
struct vring_avail *avail;
\end_layout
\begin_layout Plain Layout
struct vring_used *used;
\end_layout
\begin_layout Plain Layout
};
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
/* The standard layout for the ring is a continuous chunk of memory which
\end_layout
\begin_layout Plain Layout
* looks like this.
We assume num is a power of 2.
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* struct vring {
\end_layout
\begin_layout Plain Layout
* // The actual descriptors (16 bytes each)
\end_layout
\begin_layout Plain Layout
* struct vring_desc desc[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of available descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 avail_flags;
\end_layout
\begin_layout Plain Layout
* __u16 avail_idx;
\end_layout
\begin_layout Plain Layout
* __u16 available[num];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // Padding to the next align boundary.
\end_layout
\begin_layout Plain Layout
* char pad[];
\end_layout
\begin_layout Plain Layout
*
\end_layout
\begin_layout Plain Layout
* // A ring of used descriptor heads with free-running index.
\end_layout
\begin_layout Plain Layout
* __u16 used_flags;
\end_layout
\begin_layout Plain Layout
* __u16 used_idx;
\end_layout
\begin_layout Plain Layout
* struct vring_used_elem used[num];
\end_layout
\begin_layout Plain Layout
* };
\end_layout
\begin_layout Plain Layout
* Note: for virtio PCI, align is 4096.
\end_layout
\begin_layout Plain Layout
*/
\end_layout
\begin_layout Plain Layout
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
\end_layout
\begin_layout Plain Layout
unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
vr->num = num;
\end_layout
\begin_layout Plain Layout
vr->desc = p;
\end_layout
\begin_layout Plain Layout
vr->avail = p + num*sizeof(struct vring_desc);
\end_layout
\begin_layout Plain Layout
vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
\end_layout
\begin_layout Plain Layout
+ align-1)
\end_layout
\begin_layout Plain Layout
& ~(align - 1));
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
static inline unsigned vring_size(unsigned int num, unsigned long align)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return ((sizeof(struct vring_desc)*num + sizeof(uint16_t)*(2+num)
\end_layout
\begin_layout Plain Layout
+ align - 1) & ~(align - 1))
\end_layout
\begin_layout Plain Layout
+ sizeof(uint16_t)*3 + sizeof(struct vring_used_elem)*num;
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
\end_layout
\begin_layout Plain Layout
static inline int vring_need_event(uint16_t event_idx, uint16_t new_idx,
uint16_t old_idx)
\end_layout
\begin_layout Plain Layout
{
\end_layout
\begin_layout Plain Layout
return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx
- old_idx);
\end_layout
\begin_layout Plain Layout
}
\end_layout
\begin_layout Plain Layout
#endif /* VIRTIO_RING_H */
\end_layout
\end_inset
\end_layout
\begin_layout Chapter*
\begin_inset CommandInset label
LatexCommand label
name "cha:Reserved-Feature-Bits"
\end_inset
Appendix B: Reserved Feature Bits
\end_layout
\begin_layout Standard
Currently there are three device-independent feature bits defined:
\end_layout
\begin_layout Description
VIRTIO_F_NOTIFY_ON_EMPTY
\begin_inset space ~
\end_inset
(24) Negotiating this feature indicates that the driver wants an interrupt
if the device runs out of available descriptors on a virtqueue, even though
interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT flag or
the used_event field.
An example of this is the networking driver: it doesn't need to know every
time a packet is transmitted, but it does need to free the transmitted
packets a finite time after they are transmitted.
It can avoid using a timer if the device interrupts it when all the packets
are transmitted.
\end_layout
\begin_layout Description
VIRTIO_F_RING_INDIRECT_DESC
\begin_inset space ~
\end_inset
(28) Negotiating this feature indicates that the driver can use descriptors
with the VRING_DESC_F_INDIRECT flag set, as described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Indirect-Descriptors"
\end_inset
.
\end_layout
\begin_layout Description
VIRTIO_F_RING_EVENT_IDX
\begin_inset space ~
\end_inset
(29) This feature enables the
\emph on
used_event
\emph default
and the
\emph on
avail_event
\emph default
fields.
If set, it indicates that the device should ignore the
\emph on
flags
\emph default
field in the available ring structure.
Instead, the
\emph on
used_event
\emph default
field in this structure is used by the guest to suppress device interrupts.
Further, the driver should ignore the
\emph on
flags
\emph default
field in the used ring structure.
Instead, the
\emph on
avail_event
\emph default
field in this structure is used by the device to suppress notifications.
If unset, the driver should ignore the
\emph on
used_event
\emph default
field; the device should ignore the
\emph on
avail_event
\emph default
field; the
\emph on
flags
\emph default
field is used instead.
\end_layout
\begin_layout Chapter*
Appendix C: Network Device
\end_layout
\begin_layout Standard
The virtio network device is a virtual ethernet card, and is the most complex
of the devices supported so far by virtio.
It has been enhanced rapidly and demonstrates clearly how support for new
features should be added to an existing device.
Empty buffers are placed in one virtqueue for receiving packets, and outgoing
packets are enqueued into another for transmission in that order.
A third command queue is used to control advanced filtering features.
\end_layout
\begin_layout Section*
Configuration
\end_layout
\begin_layout Description
Subsystem
\begin_inset space ~
\end_inset
Device
\begin_inset space ~
\end_inset
ID 1
\end_layout
\begin_layout Description
Virtqueues 0:receiveq.
1:transmitq.
2:controlq
\begin_inset Foot
status open
\begin_layout Plain Layout
Only if VIRTIO_NET_F_CTRL_VQ set
\end_layout
\end_inset
\end_layout
\begin_layout Description
Feature
\begin_inset space ~
\end_inset
bits
\end_layout
\begin_deeper
\begin_layout Description
VIRTIO_NET_F_CSUM
\begin_inset space ~
\end_inset
(0) Device handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_CSUM
\begin_inset space ~
\end_inset
(1) Guest handles packets with partial checksum
\end_layout
\begin_layout Description
VIRTIO_NET_F_MAC
\begin_inset space ~
\end_inset
(5) Device has given MAC address.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GSO
\begin_inset space ~
\end_inset
(6) (Deprecated) device handles packets with any GSO type.
\begin_inset Foot
status open
\begin_layout Plain Layout
It was supposed to indicate segmentation offload support, but upon further
investigation it became clear that multiple bits were required.
\end_layout
\end_inset
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO4
\begin_inset space ~
\end_inset
(7) Guest can receive TSOv4.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_TSO6
\begin_inset space ~
\end_inset
(8) Guest can receive TSOv6.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_ECN
\begin_inset space ~
\end_inset
(9) Guest can receive TSO with ECN.
\end_layout
\begin_layout Description
VIRTIO_NET_F_GUEST_UFO
\begin_inset space ~
\end_inset