#define pr_fmt(fmt) "%s: " fmt "\n", __func__
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
/*
* A percpu refcount can be in 4 different modes. The state is tracked in the
* low two bits of percpu_ref->pcpu_count:
*
* PCPU_REF_NONE - the initial state, no percpu counters allocated.
*
* PCPU_REF_PTR - using percpu counters for the refcount.
*
* PCPU_REF_DYING - we're shutting down so get()/put() should use the embedded
* atomic counter, but we're not finished updating the atomic counter from the
* percpu counters - this means that percpu_ref_put() can't check for the ref
* hitting 0 yet.
*
* PCPU_REF_DEAD - we've finished the teardown sequence, percpu_ref_put() should
* now check for the ref hitting 0.
*
* In PCPU_REF_NONE mode, we need to count the number of times percpu_ref_get()
* is called; this is done with the high bits of the raw atomic counter. We also
* track the time, in jiffies, when the get count last wrapped - this is done
* with the remaining bits of percpu_ref->pcpu_count.
*
* So, when percpu_ref_get() is called it increments the get count and checks if
* it wrapped; if it did, it checks if the last time it wrapped was less than
* one second ago; if so, we want to allocate percpu counters.
*
* PCPU_COUNT_BITS determines the threshold where we convert to percpu: of the
* raw 64 bit counter, we use PCPU_COUNT_BITS for the refcount, and the
* remaining (high) bits to count the number of times percpu_ref_get() has been
* called. The threshold is currently (completely arbitrarily) 16384 gets in one second.
*
* Percpu mode (PCPU_REF_PTR):
*
* In percpu mode all we do on get and put is increment or decrement the cpu
* local counter, which is a 32 bit unsigned int.
*
* Note that all the gets() could be happening on one cpu, and all the puts() on
* another - the individual cpu counters can wrap (potentially many times).
*
* But this is fine because we don't need to check for the ref hitting 0 in
* percpu mode; before we set the state to PCPU_REF_DEAD we simply sum up all
* the percpu counters and add them to the atomic counter. Since addition and
* subtraction in modular arithmetic are still associative, the result will be
* correct.
*/
#define PCPU_COUNT_BITS 50
#define PCPU_COUNT_MASK ((1LL << PCPU_COUNT_BITS) - 1)
#define PCPU_STATUS_BITS 2
#define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1)
#define PCPU_REF_PTR 0
#define PCPU_REF_NONE 1
#define PCPU_REF_DYING 2
#define PCPU_REF_DEAD 3
#define REF_STATUS(count) (count & PCPU_STATUS_MASK)
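/*
 * An illustrative layout sketch of the two words, as implied by the comment
 * and the definitions above (exposition only, not authoritative):
 *
 *   ref->count (atomic64_t):
 *     bits  0..49 (PCPU_COUNT_MASK)  - the refcount proper
 *     bits 50..63                    - number of percpu_ref_get() calls;
 *                                      wraps every 2^14 = 16384 gets
 *
 *   ref->pcpu_count (unsigned long):
 *     bits 0..1 (PCPU_STATUS_MASK)   - PCPU_REF_{PTR,NONE,DYING,DEAD}
 *     remaining bits                 - PCPU_REF_NONE: jiffies of the last
 *                                      get-counter wrap, shifted left by
 *                                      PCPU_STATUS_BITS
 *                                    - PCPU_REF_PTR: the percpu pointer
 *                                      itself (its low two bits are zero,
 *                                      see the BUG_ON() in percpu_ref_alloc())
 */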
/**
* percpu_ref_init - initialize a dynamic percpu refcount
*
* Initializes the refcount in single atomic counter mode with a refcount of 1;
* analogous to atomic_set(ref, 1).
*/
void percpu_ref_init(struct percpu_ref *ref)
{
unsigned long now = jiffies;
atomic64_set(&ref->count, 1);
now <<= PCPU_STATUS_BITS;
now |= PCPU_REF_NONE;
ref->pcpu_count = now;
}
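/*
 * Usage sketch (illustrative only - 'struct foo', foo_new() and the kzalloc()
 * based allocation are hypothetical, not part of this file): a refcounted
 * object embeds a struct percpu_ref and initializes it with the initial
 * reference:
 *
 *	struct foo {
 *		struct percpu_ref	ref;
 *		...
 *	};
 *
 *	struct foo *foo_new(void)
 *	{
 *		struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);
 *
 *		if (!foo)
 *			return NULL;
 *		percpu_ref_init(&foo->ref);
 *		return foo;
 *	}
 */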
static void percpu_ref_alloc(struct percpu_ref *ref, unsigned long pcpu_count)
{
unsigned long new, now = jiffies;
now <<= PCPU_STATUS_BITS;
now |= PCPU_REF_NONE;
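/*
 * Was the last get-counter wrap less than a second ago? Both @pcpu_count and
 * @now hold jiffies << PCPU_STATUS_BITS with PCPU_REF_NONE in the low bits,
 * so the status bits cancel out of the subtraction.
 */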
if (now - pcpu_count <= HZ << PCPU_STATUS_BITS) {
rcu_read_unlock();
new = (unsigned long) alloc_percpu(unsigned);
rcu_read_lock();
if (!new)
goto update_time;
BUG_ON(new & PCPU_STATUS_MASK);
if (cmpxchg(&ref->pcpu_count, pcpu_count, new) != pcpu_count)
free_percpu((void __percpu *) new);
else
pr_debug("created");
} else {
update_time:
new = now;
cmpxchg(&ref->pcpu_count, pcpu_count, new);
}
}
void __percpu_ref_get(struct percpu_ref *ref, bool alloc)
{
unsigned long pcpu_count;
uint64_t v;
pcpu_count = ACCESS_ONCE(ref->pcpu_count);
if (REF_STATUS(pcpu_count) == PCPU_REF_PTR) {
/* for rcu - we're not using rcu_dereference() */
smp_read_barrier_depends();
__this_cpu_inc(*((unsigned __percpu *) pcpu_count));
} else {
v = atomic64_add_return(1 + (1ULL << PCPU_COUNT_BITS),
&ref->count);
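/*
 * The high bits of the atomic count the calls to percpu_ref_get(); if they
 * just wrapped back to zero and we're still in PCPU_REF_NONE mode, the ref
 * is getting hit hard enough to consider switching to percpu counters.
 */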
if (!(v >> PCPU_COUNT_BITS) &&
REF_STATUS(pcpu_count) == PCPU_REF_NONE && alloc)
percpu_ref_alloc(ref, pcpu_count);
}
}
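/*
 * Note that __percpu_ref_get() dereferences the percpu pointer without taking
 * rcu_read_lock() itself - the rcu_read_unlock()/rcu_read_lock() pair in
 * percpu_ref_alloc() shows the caller is assumed to hold it. A caller-facing
 * wrapper would presumably look something like this sketch (hypothetical, not
 * necessarily the header's actual definition):
 *
 *	static inline void percpu_ref_get(struct percpu_ref *ref)
 *	{
 *		rcu_read_lock();
 *		__percpu_ref_get(ref, true);
 *		rcu_read_unlock();
 *	}
 */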
/**
* percpu_ref_put - decrement a dynamic percpu refcount
*
* Returns true if the result is 0, otherwise false; only checks for the ref
* hitting 0 after percpu_ref_kill() has been called. Analogous to
* atomic_dec_and_test().
*/
int percpu_ref_put(struct percpu_ref *ref)
{
unsigned long pcpu_count;
uint64_t v;
int ret = 0;
rcu_read_lock();
pcpu_count = ACCESS_ONCE(ref->pcpu_count);
switch (REF_STATUS(pcpu_count)) {
case PCPU_REF_PTR:
/* for rcu - we're not using rcu_dereference() */
smp_read_barrier_depends();
__this_cpu_dec(*((unsigned __percpu *) pcpu_count));
break;
case PCPU_REF_NONE:
case PCPU_REF_DYING:
atomic64_dec(&ref->count);
break;
case PCPU_REF_DEAD:
v = atomic64_dec_return(&ref->count);
v &= PCPU_COUNT_MASK;
ret = v == 0;
break;
}
rcu_read_unlock();
return ret;
}
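/*
 * Usage sketch for the put side (illustrative - foo_put() and foo_free() are
 * hypothetical): once percpu_ref_kill() has been called, a put returning true
 * means this was the last reference and the object can be freed:
 *
 *	static void foo_put(struct foo *foo)
 *	{
 *		if (percpu_ref_put(&foo->ref))
 *			foo_free(foo);
 *	}
 */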
/**
* percpu_ref_kill - prepare a dynamic percpu refcount for teardown
*
* Must be called before dropping the initial ref, so that percpu_ref_put()
* knows to check for the refcount hitting 0. If the refcount was in percpu
* mode, converts it back to single atomic counter mode.
*
* Returns true the first time it is called on @ref and false if @ref is already
* shutting down, so it may be used by the caller for synchronizing other parts
* of a two stage shutdown.
*/
int percpu_ref_kill(struct percpu_ref *ref)
{
unsigned long old, new, status, pcpu_count;
pcpu_count = ACCESS_ONCE(ref->pcpu_count);
do {
status = REF_STATUS(pcpu_count);
switch (status) {
case PCPU_REF_PTR:
new = PCPU_REF_DYING;
break;
case PCPU_REF_NONE:
new = PCPU_REF_DEAD;
break;
case PCPU_REF_DYING:
case PCPU_REF_DEAD:
return 0;
}
old = pcpu_count;
pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
} while (pcpu_count != old);
if (status == PCPU_REF_PTR) {
unsigned count = 0, cpu;
synchronize_rcu();
for_each_possible_cpu(cpu)
count += *per_cpu_ptr((unsigned __percpu *) pcpu_count, cpu);
pr_debug("global %lli pcpu %i",
atomic64_read(&ref->count) & PCPU_COUNT_MASK,
(int) count);
atomic64_add((int) count, &ref->count);
/* Between setting global count and setting PCPU_REF_DEAD */
smp_wmb();
ref->pcpu_count = PCPU_REF_DEAD;
free_percpu((unsigned __percpu *) pcpu_count);
}
return 1;
}
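/*
 * Shutdown sketch (illustrative - foo_shutdown(), foo_stop_new_users() and
 * foo_put() are hypothetical): percpu_ref_kill() returning true means this
 * caller won the race to start teardown, so exactly one caller stops issuing
 * new references and drops the initial one:
 *
 *	static void foo_shutdown(struct foo *foo)
 *	{
 *		if (!percpu_ref_kill(&foo->ref))
 *			return;
 *
 *		foo_stop_new_users(foo);
 *		foo_put(foo);
 *	}
 */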
/**
* percpu_ref_dead - check if a dynamic percpu refcount is shutting down
*
* Returns true if percpu_ref_kill() has been called on @ref, false otherwise.
*/
int percpu_ref_dead(struct percpu_ref *ref)
{
unsigned status = REF_STATUS(ref->pcpu_count);
return status == PCPU_REF_DYING ||
status == PCPU_REF_DEAD;
}