/* libcap/cap_proc.c - pub/scm/libs/libcap/libcap - Git at Google */

 /*
  * Copyright (c) 1997-8,2007,11,19,20 Andrew G Morgan <morgan@kernel.org>
  *
  * This file deals with getting and setting capabilities on processes.
  */

 #define _GNU_SOURCE

 #include <errno.h>
 #include <fcntl.h>              /* Obtain O_* constant definitions */
 #include <grp.h>
 #include <sys/prctl.h>
 #include <sys/psx_syscall.h>
 #include <sys/securebits.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>

 #include <linux/limits.h>

 #include "libcap.h"

 /*
  * Every system call through which libcap alters kernel managed
  * capability state is routed via the wrappers below. This lets a
  * user redirect them for testing, and helps with implementing posix
  * semantics when pthreads are in use.
  *
  * _cap_syscall3 forwards a three argument system call to syscall()
  * and returns syscall()'s result unmodified.
  */
 static long int _cap_syscall3(long int syscall_nr,
                               long int arg1, long int arg2, long int arg3)
 {
     long int outcome = syscall(syscall_nr, arg1, arg2, arg3);

     return outcome;
 }

 /*
  * _cap_syscall6 forwards a six argument system call to syscall() and
  * returns syscall()'s result unmodified.
  */
 static long int _cap_syscall6(long int syscall_nr,
                               long int arg1, long int arg2, long int arg3,
                               long int arg4, long int arg5, long int arg6)
 {
     long int outcome = syscall(syscall_nr,
                                arg1, arg2, arg3, arg4, arg5, arg6);

     return outcome;
 }

 /*
  * To keep the structure of the code conceptually similar in the C and
  * Go implementations, we introduce this abstraction for invoking state
  * writing system calls. In psx+pthreaded code, the fork
  * implementation provided by nptl ensures that we can consistently
  * use the multithreaded syscalls even in the child after a fork().
  *
  * A syscaller_s bundles the two entry points that the rest of this
  * file uses whenever it issues a state changing system call.
  */
 struct syscaller_s {
     /* invoked for system calls taking up to three arguments */
     long int (*three)(long int syscall_nr,
 		      long int arg1, long int arg2, long int arg3);
     /* invoked for system calls taking up to six arguments */
     long int (*six)(long int syscall_nr,
 		    long int arg1, long int arg2, long int arg3,
 		    long int arg4, long int arg5, long int arg6);
 };

 /*
  * Use this syscaller for multi-threaded code. It starts out pointing
  * at the plain syscall() wrappers; cap_set_syscall() can replace both
  * entries, and it is the syscaller passed by the public cap_*()
  * functions in this file.
  */
 static struct syscaller_s multithread = {
     .three = _cap_syscall3,
     .six = _cap_syscall6
 };

 /*
  * Use this syscaller for single-threaded code. Nothing in this file
  * modifies it after initialization, so it always refers to the plain
  * syscall() wrappers; _cap_launch() uses it in the forked child.
  */
 static struct syscaller_s singlethread = {
     .three = _cap_syscall3,
     .six = _cap_syscall6
 };

 /*
  * This gets reset to 0 if we are *not* linked with libpsx (the weak
  * psx_load_syscalls() in this file performs that reset). While it is
  * non-zero, the _libcap_*() helpers issue their system calls through
  * the supplied syscaller; when zero they call the C library wrappers
  * (capset(), prctl(), ...) directly.
  */
 static int _libcap_overrode_syscalls = 1;

 /*
  * psx_load_syscalls() is weakly defined so that it can be overridden
  * by libpsx when that library is linked. libcap calls
  * psx_load_syscalls() offering the locations of the function
  * pointers it uses to change security state, prepared to have them
  * replaced. This fallback replaces nothing: it leaves both pointers
  * untouched and only records, by zeroing _libcap_overrode_syscalls,
  * that no override happened. If libpsx is linked, its (non-weak)
  * definition displaces this one and the flag is not forced to zero.
  */
 __attribute__((weak))
 void psx_load_syscalls(long int (**syscall_fn)(long int,
                                                long int, long int, long int),
                        long int (**syscall6_fn)(long int,
                                                 long int, long int, long int,
                                                 long int, long int, long int))
 {
     /* neither location is consulted by the fallback */
     (void) syscall_fn;
     (void) syscall6_fn;

     _libcap_overrode_syscalls = 0;
 }

 /*
  * cap_set_syscall overrides the state setting syscalls that libcap does.
  * Generally, you don't need to call this manually: libcap tries hard to
  * set things up appropriately.
  */
 void cap_set_syscall(long int (*new_syscall)(long int,
 					     long int, long int, long int),
 			    long int (*new_syscall6)(long int, long int,
 						     long int, long int,
 						     long int, long int,
 						     long int)) {
     if (new_syscall == NULL) {
 	psx_load_syscalls(&multithread.three, &multithread.six);
     } else {
 	multithread.three = new_syscall;
 	multithread.six = new_syscall6;
     }
 }

 /*
  * _libcap_capset performs the capset system call, either through the
  * supplied syscaller (when syscalls are overridden) or via the C
  * library's capset().
  */
 static int _libcap_capset(struct syscaller_s *sc,
                           cap_user_header_t header, const cap_user_data_t data)
 {
     if (!_libcap_overrode_syscalls) {
         return capset(header, data);
     }
     return sc->three(SYS_capset, (long int) header, (long int) data, 0);
 }

 /*
  * _libcap_wprctl3 issues a state writing prctl with two arguments
  * after the command, either through the supplied syscaller (when
  * syscalls are overridden) or via the C library's prctl() with the
  * remaining arguments zeroed.
  */
 static int _libcap_wprctl3(struct syscaller_s *sc,
                            long int pr_cmd, long int arg1, long int arg2)
 {
     if (!_libcap_overrode_syscalls) {
         return prctl(pr_cmd, arg1, arg2, 0, 0, 0);
     }
     return sc->three(SYS_prctl, pr_cmd, arg1, arg2);
 }

 /*
  * _libcap_wprctl6 issues a state writing prctl with five arguments
  * after the command, either through the supplied syscaller (when
  * syscalls are overridden) or via the C library's prctl().
  */
 static int _libcap_wprctl6(struct syscaller_s *sc,
                            long int pr_cmd, long int arg1, long int arg2,
                            long int arg3, long int arg4, long int arg5)
 {
     if (!_libcap_overrode_syscalls) {
         return prctl(pr_cmd, arg1, arg2, arg3, arg4, arg5);
     }
     return sc->six(SYS_prctl, pr_cmd, arg1, arg2, arg3, arg4, arg5);
 }

 /*
  * cap_get_proc obtains the capability set for the current process.
  *
  * Returns a newly allocated cap_t, or NULL if either the allocation
  * or the capget() query failed. When capget() fails, errno is
  * preserved across the release of the partially built result, in
  * the same way cap_get_pid() does.
  */
 cap_t cap_get_proc(void)
 {
     /* allocate a new capability set */
     cap_t result = cap_init();

     if (result == NULL) {
         return NULL;
     }

     _cap_debug("getting current process' capabilities");

     /* fill the capability sets via a system call */
     if (capget(&result->head, &result->u[0].set)) {
         int my_errno = errno;

         cap_free(result);
         errno = my_errno;
         return NULL;
     }

     return result;
 }

 /*
  * _cap_set_proc applies cap_d to the current process using the given
  * syscaller. A cap_d that fails the good_cap_t() check yields -1 with
  * errno set to EINVAL; otherwise the result of the capset call is
  * returned.
  */
 static int _cap_set_proc(struct syscaller_s *sc, cap_t cap_d) {
     if (good_cap_t(cap_d)) {
         _cap_debug("setting process capabilities");
         return _libcap_capset(sc, &cap_d->head, &cap_d->u[0].set);
     }

     errno = EINVAL;
     return -1;
 }

 /*
  * cap_set_proc applies cap_d to the current process via the
  * multithreaded syscaller.
  */
 int cap_set_proc(cap_t cap_d)
 {
     int status = _cap_set_proc(&multithread, cap_d);

     return status;
 }

 /* the following two functions are not required by POSIX */

 /*
  * capgetp reads the capabilities of the process identified by pid
  * into the caller supplied cap_d. The pid field of the header is
  * only set for the duration of the capget() call and is zeroed again
  * afterwards. Returns the capget() result, or -1 with errno EINVAL
  * if cap_d fails the good_cap_t() check.
  */
 int capgetp(pid_t pid, cap_t cap_d)
 {
     if (good_cap_t(cap_d)) {
         int status;

         _cap_debug("getting process capabilities for proc %d", pid);

         cap_d->head.pid = pid;
         status = capget(&cap_d->head, &cap_d->u[0].set);
         cap_d->head.pid = 0;

         return status;
     }

     errno = EINVAL;
     return -1;
 }

 /*
  * cap_get_pid allocates a capability set and fills it with the
  * capabilities of the target process. Returns NULL on failure; when
  * the query itself fails, errno from that query is preserved across
  * the release of the allocation.
  */
 cap_t cap_get_pid(pid_t pid)
 {
     cap_t result = cap_init();

     if (result == NULL || capgetp(pid, result) == 0) {
         return result;
     }

     int my_errno = errno;
     cap_free(result);
     errno = my_errno;

     return NULL;
 }

 /*
  * Set the caps on a specific process/pg etc.. The kernel has long
  * since deprecated this asynchronous interface. DON'T EXPECT THIS TO
  * EVER WORK AGAIN.
  *
  * The header's pid is set only for the duration of the capset() call;
  * afterwards the header version is restored to
  * _LIBCAP_CAPABILITY_VERSION and the pid is zeroed. Returns the
  * capset() result, or -1 with errno EINVAL if cap_d fails the
  * good_cap_t() check.
  */
 int capsetp(pid_t pid, cap_t cap_d)
 {
     if (good_cap_t(cap_d)) {
         int status;

         _cap_debug("setting process capabilities for proc %d", pid);

         cap_d->head.pid = pid;
         status = capset(&cap_d->head, &cap_d->u[0].set);
         cap_d->head.version = _LIBCAP_CAPABILITY_VERSION;
         cap_d->head.pid = 0;

         return status;
     }

     errno = EINVAL;
     return -1;
 }

 /*
  * The kernel api requires unsigned long arguments. The macro argument
  * is parenthesized so that the cast applies to the whole expression
  * rather than only to its first operand.
  */
 #define pr_arg(x) ((unsigned long) (x))

 /*
  * cap_get_bound reads a capability from the bounding set.
  *
  * Returns the value reported by prctl(PR_CAPBSET_READ) for cap, or -1
  * on failure. The C library's prctl() reports failure as -1 with
  * errno already set, so errno is left as prctl() set it (negating the
  * -1 return would always turn it into EPERM).
  */
 int cap_get_bound(cap_value_t cap)
 {
     int result = prctl(PR_CAPBSET_READ, pr_arg(cap), pr_arg(0));

     if (result < 0) {
         return -1;
     }
     return result;
 }

 /*
  * _cap_drop_bound removes cap from the bounding set using the given
  * syscaller. A negative result is negated into errno and reported as
  * -1; otherwise the result is returned as is.
  *
  * NOTE(review): when the underlying call already follows the "-1 and
  * errno" convention, the negation stores 1 in errno - confirm which
  * convention replacement syscallers are expected to follow.
  */
 static int _cap_drop_bound(struct syscaller_s *sc, cap_value_t cap)
 {
     int status = _libcap_wprctl3(sc, PR_CAPBSET_DROP,
                                  pr_arg(cap), pr_arg(0));

     if (status >= 0) {
         return status;
     }

     errno = -status;
     return -1;
 }

 /*
  * cap_drop_bound drops a capability from the bounding set, via the
  * multithreaded syscaller.
  */
 int cap_drop_bound(cap_value_t cap) {
     int status = _cap_drop_bound(&multithread, cap);

     return status;
 }

 /*
  * cap_get_ambient reads a capability from the ambient set.
  *
  * Returns the value reported by prctl(PR_CAP_AMBIENT,
  * PR_CAP_AMBIENT_IS_SET) for cap, or -1 on failure. The C library's
  * prctl() reports failure as -1 with errno already set, so errno is
  * left as prctl() set it (negating the -1 return would always turn it
  * into EPERM).
  */
 int cap_get_ambient(cap_value_t cap)
 {
     int result = prctl(PR_CAP_AMBIENT, pr_arg(PR_CAP_AMBIENT_IS_SET),
                        pr_arg(cap), pr_arg(0), pr_arg(0));

     if (result < 0) {
         return -1;
     }
     return result;
 }

 /*
  * _cap_set_ambient raises (CAP_SET) or lowers (CAP_CLEAR) a single
  * ambient capability using the given syscaller. Any other value of
  * set yields -1 with errno EINVAL. A negative prctl result is negated
  * into errno and reported as -1.
  */
 static int _cap_set_ambient(struct syscaller_s *sc,
                             cap_value_t cap, cap_flag_value_t set)
 {
     int op, status;

     if (set == CAP_SET) {
         op = PR_CAP_AMBIENT_RAISE;
     } else if (set == CAP_CLEAR) {
         op = PR_CAP_AMBIENT_LOWER;
     } else {
         errno = EINVAL;
         return -1;
     }

     status = _libcap_wprctl6(sc, PR_CAP_AMBIENT, pr_arg(op), pr_arg(cap),
                              pr_arg(0), pr_arg(0), pr_arg(0));
     if (status >= 0) {
         return status;
     }

     errno = -status;
     return -1;
 }

 /*
  * cap_set_ambient changes the value of one ambient capability, via
  * the multithreaded syscaller.
  */
 int cap_set_ambient(cap_value_t cap, cap_flag_value_t set)
 {
     int status = _cap_set_ambient(&multithread, cap, set);

     return status;
 }

 /*
  * _cap_reset_ambient clears the whole ambient set using the given
  * syscaller. It first walks the capabilities from 0 upwards looking
  * for one that is currently raised: if the walk ends (the query
  * returns -1) without finding any, there is nothing to clear, so
  * errno is restored and 0 is returned without issuing the clearing
  * prctl. Otherwise PR_CAP_AMBIENT_CLEAR_ALL is issued; a negative
  * result is negated into errno and reported as -1.
  */
 static int _cap_reset_ambient(struct syscaller_s *sc)
 {
     const int saved_errno = errno;
     cap_value_t bit = 0;

     for (;;) {
         int state = cap_get_ambient(bit++);

         if (state == -1) {
             /* ran off the end: the ambient set is already empty */
             errno = saved_errno;
             return 0;
         }
         if (state) {
             break;
         }
     }

     int status = _libcap_wprctl6(sc, PR_CAP_AMBIENT,
                                  pr_arg(PR_CAP_AMBIENT_CLEAR_ALL),
                                  pr_arg(0), pr_arg(0), pr_arg(0), pr_arg(0));
     if (status >= 0) {
         return status;
     }

     errno = -status;
     return -1;
 }

 /*
  * cap_reset_ambient erases all ambient capabilities - this reads the
  * ambient caps before performing the erase to workaround the corner
  * case where the set is empty already but the ambient cap API is
  * locked. It operates via the multithreaded syscaller.
  *
  * The parameter list is spelled (void) so the definition provides a
  * prototype and stray arguments are rejected at compile time.
  */
 int cap_reset_ambient(void)
 {
     return _cap_reset_ambient(&multithread);
 }

 /*
  * cap_get_secbits reads the security mode (securebits) of the current
  * process, returning the prctl(PR_GET_SECUREBITS) result converted to
  * unsigned.
  */
 unsigned cap_get_secbits(void)
 {
     int raw = prctl(PR_GET_SECUREBITS, pr_arg(0), pr_arg(0));

     return (unsigned) raw;
 }

 /*
  * _cap_set_secbits writes the securebits of the current process
  * using the given syscaller and returns the prctl result.
  */
 static int _cap_set_secbits(struct syscaller_s *sc, unsigned bits)
 {
     int status = _libcap_wprctl3(sc, PR_SET_SECUREBITS, bits, 0);

     return status;
 }

 /*
  * cap_set_secbits sets the security mode (securebits) of the current
  * process, via the multithreaded syscaller.
  */
 int cap_set_secbits(unsigned bits)
 {
     int status = _cap_set_secbits(&multithread, bits);

     return status;
 }

 /*
  * Some predefined constants
  *
  * CAP_SECURED_BITS_BASIC is the securebits combination that
  * cap_get_mode() requires before it will report any of the libcap
  * modes: NOROOT and NO_SETUID_FIXUP, each together with its lock
  * bit, plus the lock bit for KEEP_CAPS.
  */
 #define CAP_SECURED_BITS_BASIC                                 \
     (SECBIT_NOROOT | SECBIT_NOROOT_LOCKED |                    \
      SECBIT_NO_SETUID_FIXUP | SECBIT_NO_SETUID_FIXUP_LOCKED |  \
      SECBIT_KEEP_CAPS_LOCKED)

 /*
  * CAP_SECURED_BITS_AMBIENT extends the basic combination with the
  * bits that forbid (and lock) raising ambient capabilities. It is
  * what _cap_set_mode() applies when CAP_AMBIENT_SUPPORTED() holds.
  */
 #define CAP_SECURED_BITS_AMBIENT  (CAP_SECURED_BITS_BASIC |    \
      SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED)

 /*
  * Single element list naming CAP_SETPCAP; passed to cap_set_flag() by
  * the functions below that need that capability raised in the
  * effective set.
  */
 static cap_value_t raise_cap_setpcap[] = {CAP_SETPCAP};

 /*
  * _cap_set_mode switches the current process to the libcap mode
  * flavor using the given syscaller.
  *
  * It temporarily raises CAP_SETPCAP in the effective set, applies the
  * mode specific changes, and finally clears the whole effective set
  * again whether or not those changes succeeded. Returns 0 on success
  * and non-zero on failure; an unrecognized flavor sets errno to
  * EINVAL. If the current capabilities cannot even be read, -1 is
  * returned and nothing is changed.
  */
 static int _cap_set_mode(struct syscaller_s *sc, cap_mode_t flavor)
 {
     cap_t working = cap_get_proc();
     unsigned secbits = CAP_SECURED_BITS_AMBIENT;

     if (working == NULL) {
         /* errno was set by cap_get_proc() */
         return -1;
     }

     int ret = cap_set_flag(working, CAP_EFFECTIVE,
                            1, raise_cap_setpcap, CAP_SET);
     ret = ret | _cap_set_proc(sc, working);

     if (ret == 0) {
         /* index over capability values (not capability flags) */
         cap_value_t c;

         switch (flavor) {
         case CAP_MODE_NOPRIV:
             /* fall through */
         case CAP_MODE_PURE1E_INIT:
             (void) cap_clear_flag(working, CAP_INHERITABLE);
             /* fall through */
         case CAP_MODE_PURE1E:
             if (!CAP_AMBIENT_SUPPORTED()) {
                 secbits = CAP_SECURED_BITS_BASIC;
             } else {
                 ret = _cap_reset_ambient(sc);
                 if (ret) {
                     break; /* ambient dropping failed */
                 }
             }
             ret = _cap_set_secbits(sc, secbits);
             if (flavor != CAP_MODE_NOPRIV) {
                 break;
             }

             /* just for "case CAP_MODE_NOPRIV:" */

             /* drop every bounding bit the kernel knows about */
             for (c = 0; cap_get_bound(c) >= 0; c++) {
                 (void) _cap_drop_bound(sc, c);
             }
             (void) cap_clear_flag(working, CAP_PERMITTED);
             break;
         default:
             errno = EINVAL;
             ret = -1;
             break;
         }
     }

     /* always leave with an empty effective set */
     (void) cap_clear_flag(working, CAP_EFFECTIVE);
     ret = _cap_set_proc(sc, working) | ret;
     (void) cap_free(working);
     return ret;
 }

 /*
  * cap_set_mode locks the overarching capability framework of the
  * present process and thus its children to a predefined flavor. Once
  * set, these modes cannot be undone by the affected process tree and
  * can only be done by "cap_setpcap" permitted processes. Note, a side
  * effect of this function, whether it succeeds or fails, is to clear
  * at least the CAP_EFFECTIVE flags for the current process. The work
  * is performed via the multithreaded syscaller.
  */
 int cap_set_mode(cap_mode_t flavor)
 {
     int status = _cap_set_mode(&multithread, flavor);

     return status;
 }

 /*
  * cap_get_mode attempts to determine what the current capability mode
  * is. If it can find no match in the libcap pre-defined modes, it
  * returns CAP_MODE_UNCERTAIN.
  *
  * The checks, in order: the securebits must include the basic secured
  * combination; no ambient capability may be raised; then the process
  * capability set is compared against an empty one and the bounding
  * set is scanned to pick between PURE1E, PURE1E_INIT and NOPRIV.
  * CAP_MODE_UNCERTAIN is also returned when the capability sets needed
  * for that comparison cannot be obtained.
  */
 cap_mode_t cap_get_mode(void)
 {
     unsigned secbits = cap_get_secbits();

     if ((secbits & CAP_SECURED_BITS_BASIC) != CAP_SECURED_BITS_BASIC) {
         return CAP_MODE_UNCERTAIN;
     }

     /* validate ambient is not set */
     int olderrno = errno;
     int ret = 0;
     cap_value_t c;
     for (c = 0; !ret; c++) {
         ret = cap_get_ambient(c);
         if (ret == -1) {
             errno = olderrno;
             /*
              * c != 0 means at least one ambient query worked, so the
              * ambient securebits are expected to be in place too.
              */
             if (c && secbits != CAP_SECURED_BITS_AMBIENT) {
                 return CAP_MODE_UNCERTAIN;
             }
             break;
         }
         if (ret) {
             return CAP_MODE_UNCERTAIN;
         }
     }

     cap_t working = cap_get_proc();
     cap_t empty = cap_init();
     int cf = -1;

     /* only compare when both sets were actually obtained */
     if (working != NULL && empty != NULL) {
         cf = cap_compare(empty, working);
     }
     if (empty != NULL) {
         cap_free(empty);
     }
     if (working != NULL) {
         cap_free(working);
     }
     if (cf < 0) {
         /* not a difference bitmask: the comparison could not be made */
         return CAP_MODE_UNCERTAIN;
     }

     if (CAP_DIFFERS(cf, CAP_INHERITABLE)) {
         return CAP_MODE_PURE1E;
     }
     if (CAP_DIFFERS(cf, CAP_PERMITTED) || CAP_DIFFERS(cf, CAP_EFFECTIVE)) {
         return CAP_MODE_PURE1E_INIT;
     }

     /* any bit still present in the bounding set rules out NOPRIV */
     for (c = 0; ; c++) {
         int v = cap_get_bound(c);
         if (v == -1) {
             break;
         }
         if (v) {
             return CAP_MODE_PURE1E_INIT;
         }
     }

     return CAP_MODE_NOPRIV;
 }

 /*
  * _cap_setuid changes the uid of the process using the given
  * syscaller. CAP_SETUID is raised in the effective set and
  * PR_SET_KEEPCAPS is enabled around the uid change; afterwards
  * keep-caps is switched off again and the effective set is cleared.
  * errno as left by the uid change is what the caller sees. Returns 0
  * on success, -1 on failure.
  */
 static int _cap_setuid(struct syscaller_s *sc, uid_t uid)
 {
     const cap_value_t wanted[] = {CAP_SETUID};
     cap_t working = cap_get_proc();

     (void) cap_set_flag(working, CAP_EFFECTIVE, 1, wanted, CAP_SET);

     /*
      * Note, glibc's setuid is deliberately avoided when the way libcap
      * performs its setting syscalls has been modified. prctl needs to
      * behave in a POSIX compliant way for the code below to work, so
      * things are either all-broken or not-broken with no "sort of
      * working" in between.
      */
     (void) _libcap_wprctl3(sc, PR_SET_KEEPCAPS, 1, 0);

     int status = _cap_set_proc(sc, working);
     if (status == 0 && !_libcap_overrode_syscalls) {
         status = setuid(uid);
     } else if (status == 0) {
         status = sc->three(SYS_setuid, (long int) uid, 0, 0);
         if (status < 0) {
             errno = -status;
             status = -1;
         }
     }

     /* the cleanup below must not disturb the errno being reported */
     const int saved_errno = errno;

     (void) _libcap_wprctl3(sc, PR_SET_KEEPCAPS, 0, 0);
     (void) cap_clear_flag(working, CAP_EFFECTIVE);
     (void) _cap_set_proc(sc, working);
     (void) cap_free(working);

     errno = saved_errno;
     return status;
 }

 /*
  * cap_setuid attempts to set the uid of the process without dropping
  * any permitted capabilities in the process. A side effect of a call
  * to this function is that the effective set will be cleared by the
  * time the function returns. The work is performed via the
  * multithreaded syscaller.
  */
 int cap_setuid(uid_t uid)
 {
     int status = _cap_setuid(&multithread, uid);

     return status;
 }

 /*
  * Pick the system call number used by _cap_setgroups() when it has to
  * issue setgroups through a syscaller: SYS_setgroups32 on arm and the
  * 32-bit x86 targets listed here, SYS_setgroups everywhere else.
  * NOTE(review): presumably the plain call on those targets is the
  * legacy narrow-gid variant - confirm against the kernel ABI.
  */
 #if defined(__arm__) || defined(__i386__) || \
     defined(__i486__) || defined(__i586__) || defined(__i686__)
 #define sys_setgroups_variant  SYS_setgroups32
 #else
 #define sys_setgroups_variant  SYS_setgroups
 #endif

 /*
  * _cap_setgroups sets the gid and then the supplementary groups of
  * the process using the given syscaller. CAP_SETGID is raised in the
  * effective set for the duration; the effective set is cleared again
  * before returning. errno as left by the gid/group changes is what
  * the caller sees. Returns 0 on success, -1 on failure.
  */
 static int _cap_setgroups(struct syscaller_s *sc,
                           gid_t gid, size_t ngroups, const gid_t groups[])
 {
     const cap_value_t wanted[] = {CAP_SETGID};
     cap_t working = cap_get_proc();

     (void) cap_set_flag(working, CAP_EFFECTIVE, 1, wanted, CAP_SET);

     /*
      * Note, glibc's setgid etc are deliberately avoided when the way
      * libcap performs its setting syscalls has been modified. prctl
      * needs to behave in a POSIX compliant way for the other
      * functions of this file, so things are either all-broken or
      * not-broken with no "sort of working" in between.
      */
     int status = _cap_set_proc(sc, working);

     if (!_libcap_overrode_syscalls) {
         if (status == 0) {
             status = setgid(gid);
         }
         if (status == 0) {
             status = setgroups(ngroups, groups);
         }
     } else {
         if (status == 0) {
             status = sc->three(SYS_setgid, (long int) gid, 0, 0);
         }
         if (status == 0) {
             status = sc->three(sys_setgroups_variant, (long int) ngroups,
                                (long int) groups, 0);
         }
         if (status < 0) {
             errno = -status;
             status = -1;
         }
     }

     /* the cleanup below must not disturb the errno being reported */
     const int saved_errno = errno;

     (void) cap_clear_flag(working, CAP_EFFECTIVE);
     (void) _cap_set_proc(sc, working);
     (void) cap_free(working);

     errno = saved_errno;
     return status;
 }

 /*
  * cap_setgroups combines setting the gid with changing the set of
  * supplemental groups for a user into one call that raises the needed
  * capabilities to do it for the duration of the call. A side effect
  * of a call to this function is that the effective set will be
  * cleared by the time the function returns. The work is performed
  * via the multithreaded syscaller.
  */
 int cap_setgroups(gid_t gid, size_t ngroups, const gid_t groups[])
 {
     int status = _cap_setgroups(&multithread, gid, ngroups, groups);

     return status;
 }

 /*
  * cap_iab_get_proc returns a cap_iab_t value initialized by the
  * current process state related to these iab bits: the inheritable
  * flag of the process capability set, the ambient set, and the
  * capabilities missing from the bounding set (recorded in nb).
  *
  * Returns NULL, with errno describing the cause, if either the iab
  * value or the temporary copy of the process capabilities cannot be
  * obtained. The temporary copy is always released before returning.
  */
 cap_iab_t cap_iab_get_proc(void)
 {
     cap_iab_t iab = cap_iab_init();
     if (iab == NULL) {
         return NULL;
     }

     cap_t current = cap_get_proc();
     if (current == NULL) {
         int my_errno = errno;

         cap_free(iab);
         errno = my_errno;
         return NULL;
     }

     cap_iab_fill(iab, CAP_IAB_INH, current, CAP_INHERITABLE);
     /* the process set was only needed to seed the inheritable bits */
     cap_free(current);

     cap_value_t c;
     for (c = cap_max_bits(); c; ) {
         --c;
         int o = c >> 5;               /* which 32-bit word holds bit c */
         __u32 mask = 1U << (c & 31);  /* position of c within that word */
         if (cap_get_bound(c) == 0) {
             /* absent from the bounding set */
             iab->nb[o] |= mask;
         }
         if (cap_get_ambient(c) == 1) {
             iab->a[o] |= mask;
         }
     }
     return iab;
 }

 /*
  * _cap_iab_set_proc sets the iab collection using the requested
  * syscaller.
  *
  * The inheritable bits are written first, then the ambient set is
  * reset and rebuilt, and the requested bounding bits are dropped.
  * CAP_SETPCAP is raised in the effective set for the duration when
  * the request raises inheritable bits beyond the current I|P, or
  * touches the ambient or bounding sets; the effective set is put back
  * afterwards through the same syscaller.
  *
  * Returns 0 on success and non-zero on failure with errno describing
  * the first failure. A NULL iab yields -1 with errno EINVAL.
  */
 static int _cap_iab_set_proc(struct syscaller_s *sc, cap_iab_t iab)
 {
     int ret, i, my_errno;
     cap_t working, temp;
     cap_value_t c;
     __u32 raising = 0;

     if (iab == NULL) {
         errno = EINVAL;
         return -1;
     }

     temp = cap_get_proc();
     if (temp == NULL) {
         /* errno was set by cap_get_proc() */
         return -1;
     }

     for (i = 0; i < _LIBCAP_CAPABILITY_U32S; i++) {
         __u32 newI = iab->i[i];
         __u32 oldIP = temp->u[i].flat[CAP_INHERITABLE] |
             temp->u[i].flat[CAP_PERMITTED];
         raising |= (newI & ~oldIP) | iab->a[i] | iab->nb[i];
         temp->u[i].flat[CAP_INHERITABLE] = newI;
     }

     /* temp is the final state; working is temp plus any needed privilege */
     working = cap_dup(temp);
     if (working == NULL) {
         ret = -1;
         goto defer;
     }
     if (raising) {
         ret = cap_set_flag(working, CAP_EFFECTIVE,
                            1, raise_cap_setpcap, CAP_SET);
         if (ret) {
             goto defer;
         }
     }
     if ((ret = _cap_set_proc(sc, working))) {
         goto defer;
     }
     if ((ret = _cap_reset_ambient(sc))) {
         goto done;
     }

     for (c = cap_max_bits(); c-- != 0; ) {
         unsigned offset = c >> 5;
         __u32 mask = 1U << (c & 31);
         if (iab->a[offset] & mask) {
             ret = _cap_set_ambient(sc, c, CAP_SET);
             if (ret) {
                 goto done;
             }
         }
         if (iab->nb[offset] & mask) {
             /* drop the bounding bit */
             ret = _cap_drop_bound(sc, c);
             if (ret) {
                 goto done;
             }
         }
     }

 done:
     /* restore the effective set with the caller's syscaller */
     my_errno = errno;
     (void) _cap_set_proc(sc, temp);
     errno = my_errno;

 defer:
     my_errno = errno;
     if (working != NULL) {
         cap_free(working);
     }
     cap_free(temp);
     errno = my_errno;

     return ret;
 }

 /*
  * cap_iab_set_proc sets the iab capability vectors of the current
  * process, via the multithreaded syscaller.
  */
 int cap_iab_set_proc(cap_iab_t iab)
 {
     int status = _cap_iab_set_proc(&multithread, iab);

     return status;
 }

 /*
  * cap_launcher_callback primes the launcher with a callback that will
  * be invoked after the fork() but before any privilege has changed
  * and before the execve(). This can be used to augment the state of
  * the child process within the cap_launch() process. You can cancel
  * any callback associated with a launcher by calling this function
  * with a callback_fn value NULL.
  *
  * If the callback function returns anything other than 0, it is
  * considered to have failed and the launch will be aborted - further,
  * errno will be communicated to the parent.
  *
  * The callback receives, as its detail argument, the pointer that was
  * handed to cap_launch().
  */
 void cap_launcher_callback(cap_launch_t attr, int (callback_fn)(void *detail))
 {
     /* simply recorded here; _cap_launch() is what invokes it */
     attr->custom_setup_fn = callback_fn;
 }

 /*
  * cap_launcher_setuid records uid in the launcher and flags that the
  * launched child should attempt to change to it.
  */
 void cap_launcher_setuid(cap_launch_t attr, uid_t uid)
 {
     attr->change_uids = 1;
     attr->uid = uid;
 }

 /*
  * cap_launcher_setgroups records the gid and the list of groups in
  * the launcher and flags that the launched child should attempt to
  * change to them. The groups pointer itself is stored; the array is
  * not copied.
  */
 void cap_launcher_setgroups(cap_launch_t attr, gid_t gid,
                             int ngroups, const gid_t *groups)
 {
     attr->change_gids = 1;
     attr->gid = gid;
     attr->groups = groups;
     attr->ngroups = ngroups;
 }

 /*
  * cap_launcher_set_mode records flavor in the launcher and flags that
  * the launched child should attempt to change to that mode.
  */
 void cap_launcher_set_mode(cap_launch_t attr, cap_mode_t flavor)
 {
     attr->change_mode = 1;
     attr->mode = flavor;
 }

 /*
  * cap_launcher_set_iab installs bits as the iab value of the launcher
  * and returns the value that was installed before the call.
  */
 cap_iab_t cap_launcher_set_iab(cap_launch_t attr, cap_iab_t bits)
 {
     cap_iab_t previous;

     previous = attr->iab;
     attr->iab = bits;

     return previous;
 }

 /*
  * cap_launcher_set_chroot sets the intended chroot for the launched
  * child. The launcher stores the result of _libcap_strdup(chroot)
  * rather than the caller's pointer.
  *
  * NOTE(review): a value stored by an earlier call is overwritten
  * without being released here - confirm whether repeated calls are
  * expected and who owns the stored string.
  */
 void cap_launcher_set_chroot(cap_launch_t attr, const char *chroot)
 {
     attr->chroot = _libcap_strdup(chroot);
 }

 /*
  * _cap_chroot changes the root directory of the process to root using
  * the given syscaller. CAP_SYS_CHROOT is raised in the effective set
  * for the duration and the effective set is cleared again before
  * returning. After a successful chroot the working directory is moved
  * to "/" so that it cannot be left pointing outside the new root.
  *
  * Returns 0 on success, -1 on failure with errno describing the
  * failed step.
  */
 static int _cap_chroot(struct syscaller_s *sc, const char *root)
 {
     const cap_value_t raise_cap_sys_chroot[] = {CAP_SYS_CHROOT};
     cap_t working = cap_get_proc();

     if (working == NULL) {
         /* errno was set by cap_get_proc() */
         return -1;
     }

     (void) cap_set_flag(working, CAP_EFFECTIVE,
                         1, raise_cap_sys_chroot, CAP_SET);
     int ret = _cap_set_proc(sc, working);
     if (ret == 0) {
         if (_libcap_overrode_syscalls) {
             ret = sc->three(SYS_chroot, (long int) root, 0, 0);
             if (ret < 0) {
                 errno = -ret;
                 ret = -1;
             }
         } else {
             ret = chroot(root);
         }
         if (ret == 0) {
             /* chroot() leaves the working directory where it was */
             ret = chdir("/");
         }
     }

     /* the cleanup below must not disturb the errno being reported */
     int olderrno = errno;
     (void) cap_clear_flag(working, CAP_EFFECTIVE);
     (void) _cap_set_proc(sc, working);
     (void) cap_free(working);

     errno = olderrno;
     return ret;
 }

 /*
  * _cap_launch is invoked in the forked child, it cannot return but is
  * required to exit. The requested setup steps are performed in a
  * fixed order (callback, uid, gid/groups, mode, iab, chroot) using
  * the single-threaded syscaller, followed by the execve(). If any
  * step or the execve() fails, the errno value of that failure is
  * written over the file descriptor, fd, and the child exits with
  * status 1.
  */
 __attribute__ ((noreturn))
 static void _cap_launch(int fd, cap_launch_t attr, void *detail) {
     struct syscaller_s *sc = &singlethread;
     int my_errno;

     if (attr->custom_setup_fn && attr->custom_setup_fn(detail)) {
         goto defer;
     }

     if (attr->change_uids && _cap_setuid(sc, attr->uid)) {
         goto defer;
     }
     if (attr->change_gids &&
         _cap_setgroups(sc, attr->gid, attr->ngroups, attr->groups)) {
         goto defer;
     }
     if (attr->change_mode && _cap_set_mode(sc, attr->mode)) {
         goto defer;
     }
     if (attr->iab && _cap_iab_set_proc(sc, attr->iab)) {
         goto defer;
     }
     if (attr->chroot != NULL && _cap_chroot(sc, attr->chroot)) {
         goto defer;
     }

     /*
      * Some type wrangling to work around what the kernel API really
      * means: not "const char **".
      */
     const void *temp_args = attr->argv;
     const void *temp_envp = attr->envp;

     execve(attr->arg0, temp_args, temp_envp);
     /* if the exec worked, execution will not reach here */

 defer:
     /*
      * Getting here means an error has occurred. Capture errno before
      * writing: a failed write() overwrites errno, and a retry must
      * still send the original failure to the parent.
      */
     my_errno = errno;
     for (;;) {
         ssize_t n = write(fd, &my_errno, sizeof(my_errno));
         if (n < 0 && (errno == EAGAIN || errno == EINTR)) {
             continue;
         }
         break;
     }
     close(fd);
     exit(1);
 }

 /*
  * cap_launch performs a wrapped fork+exec that works in both an
  * unthreaded environment and also where libcap is linked with
  * psx+pthreads. The function supports dropping privilege in the
  * forked thread, but retaining privilege in the parent thread(s).
  *
  * Since the ambient set is fragile with respect to changes in I or P,
  * the function carefully orders setting of these inheritable
  * characteristics, to make sure they stick, or return an error
  * of -1 setting errno because the launch failed.
  *
  * A close-on-exec pipe connects child and parent: a successful
  * execve() closes the child's end, so the parent reads end-of-file
  * and returns the child's pid. If setup or the execve() fails, the
  * child sends its errno over the pipe; the parent then reaps the
  * child and returns -1 with errno set to the value received, or to
  * ECHILD when no usable value arrived. A NULL details yields -1 with
  * errno EINVAL.
  */
 pid_t cap_launch(cap_launch_t details, void *data) {
     int my_errno;
     int ps[2];

     if (details == NULL) {
         errno = EINVAL;
         return -1;
     }

     if (pipe2(ps, O_CLOEXEC) != 0) {
         return -1;
     }

     pid_t child = fork();
     my_errno = errno;

     if (child == 0) {
         close(ps[0]);
         /* noreturn from this function: */
         _cap_launch(ps[1], details, data);
     }

     /*
      * Only the parent (or a failed fork) gets here. The write end
      * belongs to the child, so it is closed only on this side - the
      * child still needs it to report failures.
      */
     close(ps[1]);
     if (child < 0) {
         goto defer;
     }

     /*
      * Extend this function's return codes to include setup failures
      * in the child.
      */
     for (;;) {
         int ignored;
         int child_errno = 0;
         ssize_t n = read(ps[0], &child_errno, sizeof(child_errno));

         if (n == 0) {
             /* pipe closed by a successful execve() */
             goto defer;
         }
         if (n < 0 && (errno == EAGAIN || errno == EINTR)) {
             continue;
         }

         /* the child failed before the exec: reap it and report why */
         waitpid(child, &ignored, 0);
         child = -1;
         if (n == (ssize_t) sizeof(child_errno) && child_errno != 0) {
             my_errno = child_errno;
         } else {
             my_errno = ECHILD;
         }
         break;
     }

 defer:
     close(ps[0]);
     errno = my_errno;
     return child;
 }