proc, pidns: Add highpid

Pid reuse is common, which means that it's difficult or impossible
to read information about a pid from /proc without races.

This introduces a second number associated with each (task, pidns)
pair called highpid.  Highpid is a 64-bit number, and, barring
extremely unlikely circumstances or outright error, a (highpid, pid)
pair will never be reused.

With just this change, a program can open /proc/PID/status, read the
"Highpid" field, and confirm that it has the expected value.  If the
pid has been reused, then highpid will be different.

The initial implementation is straightforward: highpid is simply a
64-bit counter. If a high-end system can fork every 3 ns (which
would be amazing, given that just allocating a pid requires an
atomic operation), it would take well over 1000 years for highpid to
wrap.

For CRIU's benefit, the next highpid can be set by a privileged
user.

NB: The sysctl stuff only works on 64-bit systems.  If the approach
looks good, I'll fix that somehow.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e..f1e0e69 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@
 	int g;
 	struct fdtable *fdt = NULL;
 	const struct cred *cred;
+	const struct upid *upid;
 	pid_t ppid, tpid;
 
 	rcu_read_lock();
@@ -170,12 +171,14 @@
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
+	upid = pid_upid_ns(pid, ns);
 	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
 		"Ngid:\t%d\n"
 		"Pid:\t%d\n"
+		"Highpid:\t%llu\n"
 		"PPid:\t%d\n"
 		"TracerPid:\t%d\n"
 		"Uid:\t%d\t%d\t%d\t%d\n"
@@ -183,7 +186,7 @@
 		get_task_state(p),
 		task_tgid_nr_ns(p, ns),
 		task_numa_group_id(p),
-		pid_nr_ns(pid, ns),
+		upid ? upid->nr : 0, upid ? upid->highnr : 0,
 		ppid, tpid,
 		from_kuid_munged(user_ns, cred->uid),
 		from_kuid_munged(user_ns, cred->euid),
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 23705a5..ece70b6 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -51,6 +51,7 @@
 	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
 	int nr;
 	struct pid_namespace *ns;
+	u64 highnr;
 	struct hlist_node pid_chain;
 };
 
@@ -170,6 +171,7 @@
 }
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
+const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns);
 pid_t pid_vnr(struct pid *pid);
 
 #define do_each_pid_task(pid, type, task)				\
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 1997ffc..fe414ec 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -20,12 +20,19 @@
 
 struct bsd_acct_struct;
 
+/* Start at 2^32 so that a highpid can never be mistaken for a valid pid. */
+#define MIN_HIGHPID ((u64)(1ULL << 32))
+
+/* Skip the top 4096 values so a highpid never looks like an error code. */
+#define MAX_HIGHPID ((u64)(-1ULL - 4096))
+
 struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	struct rcu_head rcu;
 	int last_pid;
 	unsigned int nr_hashed;
+	atomic64_t next_highpid;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a266..9cdfbb6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -76,6 +76,7 @@
 	},
 	.last_pid = 0,
 	.nr_hashed = PIDNS_HASH_ADDING,
+	.next_highpid = ATOMIC64_INIT(MIN_HIGHPID),
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
@@ -291,6 +292,29 @@
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
+static u64 alloc_highpid(struct pid_namespace *ns)
+{
+	u64 prev, old, new;
+	u64 nr = atomic64_inc_return(&ns->next_highpid) - 1;
+
+	if (likely(nr >= MIN_HIGHPID && nr <= MAX_HIGHPID))
+		return nr;
+
+	/*
+	 * Slow path, nr out of range: atomically move next_highpid into
+	 * [MIN_HIGHPID + 1, MAX_HIGHPID + 1], resetting it if needed.
+	 */
+	prev = nr + 1;	/* the value our own increment stored */
+	do {
+		old = prev;
+		new = old + 1;
+		if (new < MIN_HIGHPID + 1 || new > MAX_HIGHPID + 1)
+			new = MIN_HIGHPID + 1;
+		prev = atomic64_cmpxchg(&ns->next_highpid, old, new);
+	} while (prev != old);
+	return new - 1;	/* ids trail the stored counter by one */
+}
+
 struct pid *alloc_pid(struct pid_namespace *ns)
 {
 	struct pid *pid;
@@ -312,6 +336,7 @@
 
 		pid->numbers[i].nr = nr;
 		pid->numbers[i].ns = tmp;
+		pid->numbers[i].highnr = alloc_highpid(tmp);
 		tmp = tmp->parent;
 	}
 
@@ -492,17 +517,26 @@
 }
 EXPORT_SYMBOL_GPL(find_get_pid);
 
-pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+const struct upid *pid_upid_ns(struct pid *pid, struct pid_namespace *ns)
 {
 	struct upid *upid;
-	pid_t nr = 0;
 
 	if (pid && ns->level <= pid->level) {
 		upid = &pid->numbers[ns->level];
 		if (upid->ns == ns)
-			nr = upid->nr;
+			return upid;	/* pid's entry at ns's own level */
 	}
-	return nr;
+	return NULL;	/* NULL pid, or pid has no entry belonging to ns */
+}
+EXPORT_SYMBOL_GPL(pid_upid_ns);
+
+pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+{
+	const struct upid *entry;
+
+	/* A pid with no entry in ns is reported as 0. */
+	entry = pid_upid_ns(pid, ns);
+	return entry ? entry->nr : 0;
 }
 EXPORT_SYMBOL_GPL(pid_nr_ns);
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8e..cbbaa14 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -114,6 +114,7 @@
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->nr_hashed = PIDNS_HASH_ADDING;
+	atomic64_set(&ns->next_highpid, MIN_HIGHPID);
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	set_bit(0, ns->pidmap[0].page);
@@ -268,6 +269,22 @@
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
+static int pid_ns_next_highpid_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
+	struct ctl_table tmp = *table;
+
+	/* The sysctl file is mode 0666, so writers are vetted here instead. */
+	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* FIXME: proc_doulongvec_minmax() only fits a u64 on 64-bit builds. */
+	BUILD_BUG_ON(sizeof(u64) != sizeof(unsigned long));
+	tmp.data = &pid_ns->next_highpid;
+	return proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
 extern int pid_max;
 static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
@@ -279,6 +296,12 @@
 		.extra1 = &zero,
 		.extra2 = &pid_max,
 	},
+	{
+		.procname = "ns_next_highpid",
+		.maxlen = sizeof(u64),
+		.mode = 0666, /* permissions are checked in the handler */
+		.proc_handler = pid_ns_next_highpid_handler,
+	},
 	{ }
 };
 static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };