| .\" Copyright 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> |
| .\" |
| .\" %%%LICENSE_START(VERBATIM) |
| .\" Permission is granted to make and distribute verbatim copies of this |
| .\" manual provided the copyright notice and this permission notice are |
| .\" preserved on all copies. |
| .\" |
| .\" Permission is granted to copy and distribute modified versions of this |
| .\" manual under the conditions for verbatim copying, provided that the |
| .\" entire resulting derived work is distributed under the terms of a |
| .\" permission notice identical to this one. |
| .\" |
| .\" Since the Linux kernel and libraries are constantly changing, this |
| .\" manual page may be incorrect or out-of-date. The author(s) assume no |
| .\" responsibility for errors or omissions, or for damages resulting from |
| .\" the use of the information contained herein. The author(s) may not |
| .\" have taken the same level of care in the production of this manual, |
| .\" which is licensed free of charge, as they might when working |
| .\" professionally. |
| .\" |
| .\" Formatted or processed versions of this manual, if unaccompanied by |
| .\" the source, must acknowledge the copyright and authors of this work. |
| .\" %%%LICENSE_END |
| .\" |
| .TH MEMBARRIER 2 2015-12-28 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| membarrier \- issue memory barriers on a set of threads |
| .SH SYNOPSIS |
| .B #include <linux/membarrier.h> |
| .sp |
| .BI "int membarrier(int " cmd ", int " flags ");" |
| .SH DESCRIPTION |
| The |
| .BR membarrier () |
| system call helps reduce the overhead of the memory barrier |
| instructions required to order memory accesses on multi-core systems. |
| However, this system call is heavier than a memory barrier, so using it |
| effectively is |
| .I not |
| as simple as replacing memory barriers with this |
| system call, but requires understanding of the details below. |
| |
| When using memory barriers, keep in mind that a memory barrier always |
| needs either to be matched with its memory barrier counterparts, or to |
| be used on an architecture whose memory model doesn't require the |
| matching barriers. |
| |
| There are cases where one side of the matching barriers (which we will |
| refer to as "fast side") is executed much more often than the other |
| (which we will refer to as "slow side"). |
| This is a prime target for the use of |
| .BR membarrier (). |
| The key idea is to replace, for these matching |
| barriers, the fast-side memory barriers by simple compiler barriers, |
| for example: |
| |
| asm volatile ("" : : : "memory") |
| |
| and replace the slow-side memory barriers by calls to |
| .BR membarrier (). |
| |
| This will add overhead to the slow side, and remove overhead from the |
| fast side, thus resulting in an overall performance increase as long as |
| the slow side is infrequent enough that the overhead of the |
| .BR membarrier () |
| calls does not outweigh the performance gain on the fast side. |
| |
| The |
| .I cmd |
| argument is one of the following: |
| .TP |
| .B MEMBARRIER_CMD_QUERY |
| Query the set of supported commands. |
| The return value of the call is a bit mask of supported |
| commands. |
| .BR MEMBARRIER_CMD_QUERY , |
| which has the value 0, |
| is not itself included in this bit mask. |
| This command is always supported (on kernels where |
| .BR membarrier () |
| is provided). |
| .TP |
| .B MEMBARRIER_CMD_SHARED |
| Ensure that all threads from all processes on the system pass through a |
| state where all memory accesses to user-space addresses match program |
| order between entry to and return from the |
| .BR membarrier () |
| system call. |
| All threads on the system are targeted by this command. |
| .PP |
| The |
| .I flags |
| argument is currently unused and must be specified as 0. |
| .PP |
| All memory accesses performed in program order from each targeted thread |
| are guaranteed to be ordered with respect to |
| .BR membarrier (). |
| |
| If we use the notation |
| .I barrier() |
| to represent a compiler barrier forcing memory |
| accesses to be performed in program order across the barrier, and |
| .I smp_mb() |
| to represent explicit memory barriers forcing full memory |
| ordering across the barrier, we have the following ordering table for |
| each pairing of |
| .IR barrier() , |
| .BR membarrier () |
| and |
| .IR smp_mb() . |
| The pair ordering is detailed as (O: ordered, X: not ordered): |
| |
| .nf |
|                        barrier()  smp_mb()  membarrier() |
|        barrier()          X          X          O |
|        smp_mb()           X          O          O |
|        membarrier()       O          O          O |
| .fi |
| .SH RETURN VALUE |
| On success, the |
| .B MEMBARRIER_CMD_QUERY |
| operation returns a bit mask of supported commands and the |
| .B MEMBARRIER_CMD_SHARED |
| operation returns zero. |
| On error, \-1 is returned, |
| and |
| .I errno |
| is set appropriately. |
| |
| For a given command, with |
| .I flags |
| set to 0, this system call is |
| guaranteed to always return the same value until reboot. |
| Further calls with the same arguments will lead to the same result. |
| Therefore, with |
| .I flags |
| set to 0, error handling is required only for the first call to |
| .BR membarrier (). |
| .SH ERRORS |
| .TP |
| .B EINVAL |
| .I cmd |
| is invalid or |
| .I flags |
| is non-zero. |
| .TP |
| .B ENOSYS |
| The |
| .BR membarrier () |
| system call is not implemented by this kernel. |
| .SH VERSIONS |
| The |
| .BR membarrier () |
| system call was added in Linux 4.3. |
| .\" |
| .SH CONFORMING TO |
| .BR membarrier () |
| is Linux-specific. |
| .SH NOTES |
| A memory barrier instruction is part of the instruction set of |
| architectures with weakly ordered memory models. |
| It orders memory |
| accesses prior to the barrier and after the barrier with respect to |
| matching barriers on other cores. |
| For instance, a load fence can order |
| loads prior to and following that fence with respect to stores ordered |
| by store fences. |
| |
| Program order is the order in which instructions appear in the |
| program's assembly code. |
| |
| Examples where |
| .BR membarrier () |
| can be useful include implementations |
| of Read-Copy-Update libraries and garbage collectors. |
| .SH EXAMPLE |
| Assuming a multithreaded application where "fast_path()" is executed |
| very frequently, and where "slow_path()" is executed infrequently, the |
| following code (x86) can be transformed using |
| .BR membarrier (): |
| |
| .in +4n |
| .nf |
| #include <stdlib.h> |
| |
| static volatile int a, b; |
| |
| static void |
| fast_path(void) |
| { |
| int read_a, read_b; |
| |
| read_b = b; |
| asm volatile ("mfence" : : : "memory"); |
| read_a = a; |
| |
| /* read_b == 1 implies read_a == 1. */ |
| |
| if (read_b == 1 && read_a == 0) |
| abort(); |
| } |
| |
| static void |
| slow_path(void) |
| { |
| a = 1; |
| asm volatile ("mfence" : : : "memory"); |
| b = 1; |
| } |
| |
| int |
| main(int argc, char **argv) |
| { |
| /* |
| * Real applications would call fast_path() and slow_path() |
| * from different threads. Call those from main() to keep |
| * this example short. |
| */ |
| |
| slow_path(); |
| fast_path(); |
| |
| exit(EXIT_SUCCESS); |
| } |
| .fi |
| .in |
| |
| The code above transformed to use |
| .BR membarrier () |
| becomes: |
| |
| .in +4n |
| .nf |
| #define _GNU_SOURCE |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <sys/syscall.h> |
| #include <linux/membarrier.h> |
| |
| static volatile int a, b; |
| |
| static int |
| membarrier(int cmd, int flags) |
| { |
| return syscall(__NR_membarrier, cmd, flags); |
| } |
| |
| static int |
| init_membarrier(void) |
| { |
| int ret; |
| |
| /* Check that membarrier() is supported. */ |
| |
| ret = membarrier(MEMBARRIER_CMD_QUERY, 0); |
| if (ret < 0) { |
| perror("membarrier"); |
| return \-1; |
| } |
| |
| if (!(ret & MEMBARRIER_CMD_SHARED)) { |
| fprintf(stderr, |
| "membarrier does not support MEMBARRIER_CMD_SHARED\\n"); |
| return \-1; |
| } |
| |
| return 0; |
| } |
| |
| static void |
| fast_path(void) |
| { |
| int read_a, read_b; |
| |
| read_b = b; |
| asm volatile ("" : : : "memory"); |
| read_a = a; |
| |
| /* read_b == 1 implies read_a == 1. */ |
| |
| if (read_b == 1 && read_a == 0) |
| abort(); |
| } |
| |
| static void |
| slow_path(void) |
| { |
| a = 1; |
| membarrier(MEMBARRIER_CMD_SHARED, 0); |
| b = 1; |
| } |
| |
| int |
| main(int argc, char **argv) |
| { |
| if (init_membarrier()) |
| exit(EXIT_FAILURE); |
| |
| /* |
| * Real applications would call fast_path() and slow_path() |
| * from different threads. Call those from main() to keep |
| * this example short. |
| */ |
| |
| slow_path(); |
| fast_path(); |
| |
| exit(EXIT_SUCCESS); |
| } |
| .fi |
| .in |