|  | // SPDX-License-Identifier: GPL-2.0 | 
|  | #include <linux/slab.h> | 
|  | #include <linux/lockdep.h> | 
|  | #include <linux/sysfs.h> | 
|  | #include <linux/kobject.h> | 
|  | #include <linux/memory.h> | 
|  | #include <linux/memory-tiers.h> | 
|  | #include <linux/notifier.h> | 
|  | #include <linux/sched/sysctl.h> | 
|  |  | 
|  | #include "internal.h" | 
|  |  | 
|  | struct memory_tier { | 
|  | /* hierarchy of memory tiers */ | 
|  | struct list_head list; | 
|  | /* list of all memory types part of this tier */ | 
|  | struct list_head memory_types; | 
|  | /* | 
|  | * Start value of abstract distance. A memory tier maps an | 
|  | * abstract distance range, | 
|  | * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE. | 
|  | */ | 
|  | int adistance_start; | 
|  | struct device dev; | 
|  | /* All the nodes that are part of all the lower memory tiers. */ | 
|  | nodemask_t lower_tier_mask; | 
|  | }; | 
|  |  | 
|  | struct demotion_nodes { | 
|  | nodemask_t preferred; | 
|  | }; | 
|  |  | 
|  | struct node_memory_type_map { | 
|  | struct memory_dev_type *memtype; | 
|  | int map_count; | 
|  | }; | 
|  |  | 
|  | static DEFINE_MUTEX(memory_tier_lock); | 
|  | static LIST_HEAD(memory_tiers); | 
|  | /* | 
|  | * The list is used to store all memory types that are not created | 
|  | * by a device driver. | 
|  | */ | 
|  | static LIST_HEAD(default_memory_types); | 
|  | static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; | 
|  | struct memory_dev_type *default_dram_type; | 
|  | nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE; | 
|  |  | 
|  | static const struct bus_type memory_tier_subsys = { | 
|  | .name = "memory_tiering", | 
|  | .dev_name = "memory_tier", | 
|  | }; | 
|  |  | 
|  | #ifdef CONFIG_NUMA_BALANCING | 
|  | /** | 
|  | * folio_use_access_time - check if a folio reuses cpupid for page access time | 
|  | * @folio: folio to check | 
|  | * | 
|  | * A folio's _last_cpupid field is repurposed by memory tiering. In memory | 
|  | * tiering mode, the cpupid of a slow memory folio (not toptier memory) is | 
|  | * used to record the page access time instead. | 
|  | * | 
|  | * Return: true if the folio's _last_cpupid field is used to record page | 
|  | * access time, false otherwise. | 
|  | */ | 
|  | bool folio_use_access_time(struct folio *folio) | 
|  | { | 
|  | return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && | 
|  | !node_is_toptier(folio_nid(folio)); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | #ifdef CONFIG_MIGRATION | 
|  | static int top_tier_adistance; | 
|  | /* | 
|  | * node_demotion[] examples: | 
|  | * | 
|  | * Example 1: | 
|  | * | 
|  | * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. | 
|  | * | 
|  | * node distances: | 
|  | * node   0    1    2    3 | 
|  | *    0  10   20   30   40 | 
|  | *    1  20   10   40   30 | 
|  | *    2  30   40   10   40 | 
|  | *    3  40   30   40   10 | 
|  | * | 
|  | * memory_tiers0 = 0-1 | 
|  | * memory_tiers1 = 2-3 | 
|  | * | 
|  | * node_demotion[0].preferred = 2 | 
|  | * node_demotion[1].preferred = 3 | 
|  | * node_demotion[2].preferred = <empty> | 
|  | * node_demotion[3].preferred = <empty> | 
|  | * | 
|  | * Example 2: | 
|  | * | 
|  | * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. | 
|  | * | 
|  | * node distances: | 
|  | * node   0    1    2 | 
|  | *    0  10   20   30 | 
|  | *    1  20   10   30 | 
|  | *    2  30   30   10 | 
|  | * | 
|  | * memory_tiers0 = 0-2 | 
|  | * | 
|  | * node_demotion[0].preferred = <empty> | 
|  | * node_demotion[1].preferred = <empty> | 
|  | * node_demotion[2].preferred = <empty> | 
|  | * | 
|  | * Example 3: | 
|  | * | 
|  | * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. | 
|  | * | 
|  | * node distances: | 
|  | * node   0    1    2 | 
|  | *    0  10   20   30 | 
|  | *    1  20   10   40 | 
|  | *    2  30   40   10 | 
|  | * | 
|  | * memory_tiers0 = 1 | 
|  | * memory_tiers1 = 0 | 
|  | * memory_tiers2 = 2 | 
|  | * | 
|  | * node_demotion[0].preferred = 2 | 
|  | * node_demotion[1].preferred = 0 | 
|  | * node_demotion[2].preferred = <empty> | 
|  | * | 
|  | */ | 
|  | static struct demotion_nodes *node_demotion __read_mostly; | 
|  | #endif /* CONFIG_MIGRATION */ | 
|  |  | 
|  | static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); | 
|  |  | 
|  | /* The lock is used to protect `default_dram_perf*` info and nid. */ | 
|  | static DEFINE_MUTEX(default_dram_perf_lock); | 
|  | static bool default_dram_perf_error; | 
|  | static struct access_coordinate default_dram_perf; | 
|  | static int default_dram_perf_ref_nid = NUMA_NO_NODE; | 
|  | static const char *default_dram_perf_ref_source; | 
|  |  | 
|  | static inline struct memory_tier *to_memory_tier(struct device *device) | 
|  | { | 
|  | return container_of(device, struct memory_tier, dev); | 
|  | } | 
|  |  | 
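|  | /* | 
|  | * Return the union of the nodemasks of all memory types attached to | 
|  | * @memtier. Called with memory_tier_lock held so the list stays stable. | 
|  | */ | 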
|  | static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) | 
|  | { | 
|  | nodemask_t nodes = NODE_MASK_NONE; | 
|  | struct memory_dev_type *memtype; | 
|  |  | 
|  | list_for_each_entry(memtype, &memtier->memory_types, tier_sibling) | 
|  | nodes_or(nodes, nodes, memtype->nodes); | 
|  |  | 
|  | return nodes; | 
|  | } | 
|  |  | 
|  | static void memory_tier_device_release(struct device *dev) | 
|  | { | 
|  | struct memory_tier *tier = to_memory_tier(dev); | 
|  | /* | 
|  | * The synchronize_rcu() in clear_node_memory_tier() makes sure | 
|  | * nobody still has an RCU reference to this memory tier. | 
|  | */ | 
|  | kfree(tier); | 
|  | } | 
|  |  | 
|  | static ssize_t nodelist_show(struct device *dev, | 
|  | struct device_attribute *attr, char *buf) | 
|  | { | 
|  | int ret; | 
|  | nodemask_t nmask; | 
|  |  | 
|  | mutex_lock(&memory_tier_lock); | 
|  | nmask = get_memtier_nodemask(to_memory_tier(dev)); | 
|  | ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | return ret; | 
|  | } | 
|  | static DEVICE_ATTR_RO(nodelist); | 
|  |  | 
|  | static struct attribute *memtier_dev_attrs[] = { | 
|  | &dev_attr_nodelist.attr, | 
|  | NULL | 
|  | }; | 
|  |  | 
|  | static const struct attribute_group memtier_dev_group = { | 
|  | .attrs = memtier_dev_attrs, | 
|  | }; | 
|  |  | 
|  | static const struct attribute_group *memtier_dev_groups[] = { | 
|  | &memtier_dev_group, | 
|  | NULL | 
|  | }; | 
|  |  | 
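|  | /* | 
|  | * Find the memory tier covering the abstract distance of @memtype, | 
|  | * creating and registering a new tier if none exists yet, and link | 
|  | * @memtype to it. Returns the tier or an ERR_PTR() on failure. | 
|  | * Called with memory_tier_lock held. | 
|  | */ | 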
|  | static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) | 
|  | { | 
|  | int ret; | 
|  | bool found_slot = false; | 
|  | struct memory_tier *memtier, *new_memtier; | 
|  | int adistance = memtype->adistance; | 
|  | unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; | 
|  |  | 
|  | lockdep_assert_held_once(&memory_tier_lock); | 
|  |  | 
|  | adistance = round_down(adistance, memtier_adistance_chunk_size); | 
|  | /* | 
|  | * If the memtype is already part of a memory tier, | 
|  | * just return that. | 
|  | */ | 
|  | if (!list_empty(&memtype->tier_sibling)) { | 
|  | list_for_each_entry(memtier, &memory_tiers, list) { | 
|  | if (adistance == memtier->adistance_start) | 
|  | return memtier; | 
|  | } | 
|  | WARN_ON(1); | 
|  | return ERR_PTR(-EINVAL); | 
|  | } | 
|  |  | 
|  | list_for_each_entry(memtier, &memory_tiers, list) { | 
|  | if (adistance == memtier->adistance_start) { | 
|  | goto link_memtype; | 
|  | } else if (adistance < memtier->adistance_start) { | 
|  | found_slot = true; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); | 
|  | if (!new_memtier) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | new_memtier->adistance_start = adistance; | 
|  | INIT_LIST_HEAD(&new_memtier->list); | 
|  | INIT_LIST_HEAD(&new_memtier->memory_types); | 
|  | if (found_slot) | 
|  | list_add_tail(&new_memtier->list, &memtier->list); | 
|  | else | 
|  | list_add_tail(&new_memtier->list, &memory_tiers); | 
|  |  | 
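|  | /* The device id is the tier index: abstract distance divided by the chunk size. */ | 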
|  | new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; | 
|  | new_memtier->dev.bus = &memory_tier_subsys; | 
|  | new_memtier->dev.release = memory_tier_device_release; | 
|  | new_memtier->dev.groups = memtier_dev_groups; | 
|  |  | 
|  | ret = device_register(&new_memtier->dev); | 
|  | if (ret) { | 
|  | list_del(&new_memtier->list); | 
|  | put_device(&new_memtier->dev); | 
|  | return ERR_PTR(ret); | 
|  | } | 
|  | memtier = new_memtier; | 
|  |  | 
|  | link_memtype: | 
|  | list_add(&memtype->tier_sibling, &memtier->memory_types); | 
|  | return memtier; | 
|  | } | 
|  |  | 
|  | static struct memory_tier *__node_get_memory_tier(int node) | 
|  | { | 
|  | pg_data_t *pgdat; | 
|  |  | 
|  | pgdat = NODE_DATA(node); | 
|  | if (!pgdat) | 
|  | return NULL; | 
|  | /* | 
|  | * Since we hold memory_tier_lock, we can avoid | 
|  | * RCU read locks when accessing the details. No | 
|  | * parallel updates are possible here. | 
|  | */ | 
|  | return rcu_dereference_check(pgdat->memtier, | 
|  | lockdep_is_held(&memory_tier_lock)); | 
|  | } | 
|  |  | 
|  | #ifdef CONFIG_MIGRATION | 
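|  | /* | 
|  | * A node is considered top tier if it has no memory tier assigned yet, or | 
|  | * if its tier's abstract distance range starts at or below top_tier_adistance. | 
|  | */ | 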
|  | bool node_is_toptier(int node) | 
|  | { | 
|  | bool toptier; | 
|  | pg_data_t *pgdat; | 
|  | struct memory_tier *memtier; | 
|  |  | 
|  | pgdat = NODE_DATA(node); | 
|  | if (!pgdat) | 
|  | return false; | 
|  |  | 
|  | rcu_read_lock(); | 
|  | memtier = rcu_dereference(pgdat->memtier); | 
|  | if (!memtier) { | 
|  | toptier = true; | 
|  | goto out; | 
|  | } | 
|  | if (memtier->adistance_start <= top_tier_adistance) | 
|  | toptier = true; | 
|  | else | 
|  | toptier = false; | 
|  | out: | 
|  | rcu_read_unlock(); | 
|  | return toptier; | 
|  | } | 
|  |  | 
|  | void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) | 
|  | { | 
|  | struct memory_tier *memtier; | 
|  |  | 
|  | /* | 
|  | * pg_data_t.memtier updates include a synchronize_rcu(), | 
|  | * which ensures that we either find NULL or a valid memtier | 
|  | * in NODE_DATA. Protect the access via rcu_read_lock(). | 
|  | */ | 
|  | rcu_read_lock(); | 
|  | memtier = rcu_dereference(pgdat->memtier); | 
|  | if (memtier) | 
|  | *targets = memtier->lower_tier_mask; | 
|  | else | 
|  | *targets = NODE_MASK_NONE; | 
|  | rcu_read_unlock(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * next_demotion_node() - Get the next node in the demotion path | 
|  | * @node: The starting node to lookup the next node | 
|  | * | 
|  | * Return: node id for next memory node in the demotion path hierarchy | 
|  | * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep | 
|  | * @node online or guarantee that it *continues* to be the next demotion | 
|  | * target. | 
|  | */ | 
|  | int next_demotion_node(int node) | 
|  | { | 
|  | struct demotion_nodes *nd; | 
|  | int target; | 
|  |  | 
|  | if (!node_demotion) | 
|  | return NUMA_NO_NODE; | 
|  |  | 
|  | nd = &node_demotion[node]; | 
|  |  | 
|  | /* | 
|  | * node_demotion[] is updated without excluding this | 
|  | * function from running. | 
|  | * | 
|  | * Make sure to use RCU over entire code blocks if | 
|  | * node_demotion[] reads need to be consistent. | 
|  | */ | 
|  | rcu_read_lock(); | 
|  | /* | 
|  | * If there are multiple target nodes, just select one | 
|  | * target node randomly. | 
|  | * | 
|  | * We could also use round-robin to select the target node, | 
|  | * but that would require another variable in node_demotion[] | 
|  | * to record the last selected target node, which may cause | 
|  | * cache ping-pong as the last target node keeps changing. | 
|  | * Per-CPU data could avoid the caching issue, but seems more | 
|  | * complicated. So selecting a target node randomly seems | 
|  | * better for now. | 
|  | */ | 
|  | target = node_random(&nd->preferred); | 
|  | rcu_read_unlock(); | 
|  |  | 
|  | return target; | 
|  | } | 
|  |  | 
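|  | /* | 
|  | * Clear the preferred demotion targets and lower-tier masks for all | 
|  | * memory nodes, then wait for in-flight RCU readers so that nobody | 
|  | * keeps using the old state. | 
|  | */ | 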
|  | static void disable_all_demotion_targets(void) | 
|  | { | 
|  | struct memory_tier *memtier; | 
|  | int node; | 
|  |  | 
|  | for_each_node_state(node, N_MEMORY) { | 
|  | node_demotion[node].preferred = NODE_MASK_NONE; | 
|  | /* | 
|  | * We are holding memory_tier_lock, so it is safe | 
|  | * to access pgdat->memtier. | 
|  | */ | 
|  | memtier = __node_get_memory_tier(node); | 
|  | if (memtier) | 
|  | memtier->lower_tier_mask = NODE_MASK_NONE; | 
|  | } | 
|  | /* | 
|  | * Ensure that the "disable" is visible across the system. | 
|  | * Readers will see either a combination of before+disable | 
|  | * state or disable+after.  They will never see before and | 
|  | * after state together. | 
|  | */ | 
|  | synchronize_rcu(); | 
|  | } | 
|  |  | 
|  | static void dump_demotion_targets(void) | 
|  | { | 
|  | int node; | 
|  |  | 
|  | for_each_node_state(node, N_MEMORY) { | 
|  | struct memory_tier *memtier = __node_get_memory_tier(node); | 
|  | nodemask_t preferred = node_demotion[node].preferred; | 
|  |  | 
|  | if (!memtier) | 
|  | continue; | 
|  |  | 
|  | if (nodes_empty(preferred)) | 
|  | pr_info("Demotion targets for Node %d: null\n", node); | 
|  | else | 
|  | pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", | 
|  | node, nodemask_pr_args(&preferred), | 
|  | nodemask_pr_args(&memtier->lower_tier_mask)); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Find an automatic demotion target for all memory | 
|  | * nodes. Failing here is OK.  It might just indicate | 
|  | * being at the end of a chain. | 
|  | */ | 
|  | static void establish_demotion_targets(void) | 
|  | { | 
|  | struct memory_tier *memtier; | 
|  | struct demotion_nodes *nd; | 
|  | int target = NUMA_NO_NODE, node; | 
|  | int distance, best_distance; | 
|  | nodemask_t tier_nodes, lower_tier; | 
|  |  | 
|  | lockdep_assert_held_once(&memory_tier_lock); | 
|  |  | 
|  | if (!node_demotion) | 
|  | return; | 
|  |  | 
|  | disable_all_demotion_targets(); | 
|  |  | 
|  | for_each_node_state(node, N_MEMORY) { | 
|  | best_distance = -1; | 
|  | nd = &node_demotion[node]; | 
|  |  | 
|  | memtier = __node_get_memory_tier(node); | 
|  | if (!memtier || list_is_last(&memtier->list, &memory_tiers)) | 
|  | continue; | 
|  | /* | 
|  | * Get the lower memtier to find the demotion node list. | 
|  | */ | 
|  | memtier = list_next_entry(memtier, list); | 
|  | tier_nodes = get_memtier_nodemask(memtier); | 
|  | /* | 
|  | * find_next_best_node() uses the 'used' nodemask as a skip list. | 
|  | * Add all memory nodes except the selected memory tier's | 
|  | * nodelist to the skip list so that we find the best node from | 
|  | * that memtier's nodelist. | 
|  | */ | 
|  | nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); | 
|  |  | 
|  | /* | 
|  | * Find all the nodes in the memory tier node list with the same best | 
|  | * distance and add them to the preferred mask. We randomly select between nodes | 
|  | * in the preferred mask when allocating pages during demotion. | 
|  | */ | 
|  | do { | 
|  | target = find_next_best_node(node, &tier_nodes); | 
|  | if (target == NUMA_NO_NODE) | 
|  | break; | 
|  |  | 
|  | distance = node_distance(node, target); | 
|  | if (distance == best_distance || best_distance == -1) { | 
|  | best_distance = distance; | 
|  | node_set(target, nd->preferred); | 
|  | } else { | 
|  | break; | 
|  | } | 
|  | } while (1); | 
|  | } | 
|  | /* | 
|  | * Promotion is allowed from a memory tier to a higher | 
|  | * memory tier only if the lower tier doesn't include | 
|  | * compute. We want to skip promotion from a memory tier | 
|  | * if any node that is part of that memory tier has CPUs. | 
|  | * Once we detect such a memory tier, we consider that tier | 
|  | * as the top tier, from which promotion is not allowed. | 
|  | */ | 
|  | list_for_each_entry_reverse(memtier, &memory_tiers, list) { | 
|  | tier_nodes = get_memtier_nodemask(memtier); | 
|  | nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); | 
|  | if (!nodes_empty(tier_nodes)) { | 
|  | /* | 
|  | * abstract distance below the max value of this memtier | 
|  | * is considered toptier. | 
|  | */ | 
|  | top_tier_adistance = memtier->adistance_start + | 
|  | MEMTIER_CHUNK_SIZE - 1; | 
|  | break; | 
|  | } | 
|  | } | 
|  | /* | 
|  | * Now build the lower_tier mask for each node by collecting the node mask | 
|  | * of every memory tier below it. This allows us to fall back demotion page | 
|  | * allocation to a set of nodes that is closer to the above selected | 
|  | * preferred node. | 
|  | */ | 
|  | lower_tier = node_states[N_MEMORY]; | 
|  | list_for_each_entry(memtier, &memory_tiers, list) { | 
|  | /* | 
|  | * Keep removing the current tier from the lower_tier nodes. | 
|  | * This removes all nodes in the current and above | 
|  | * memory tiers from the lower_tier mask. | 
|  | */ | 
|  | tier_nodes = get_memtier_nodemask(memtier); | 
|  | nodes_andnot(lower_tier, lower_tier, tier_nodes); | 
|  | memtier->lower_tier_mask = lower_tier; | 
|  | } | 
|  |  | 
|  | dump_demotion_targets(); | 
|  | } | 
|  |  | 
|  | #else | 
|  | static inline void establish_demotion_targets(void) {} | 
|  | #endif /* CONFIG_MIGRATION */ | 
|  |  | 
|  | static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) | 
|  | { | 
|  | if (!node_memory_types[node].memtype) | 
|  | node_memory_types[node].memtype = memtype; | 
|  | /* | 
|  | * For each device getting added to the same NUMA node | 
|  | * with this specific memtype, bump the map count. We | 
|  | * only take the memtype device reference once, so that | 
|  | * changing a node's memtype can be done by dropping the | 
|  | * only reference count taken here. | 
|  | */ | 
|  |  | 
|  | if (node_memory_types[node].memtype == memtype) { | 
|  | if (!node_memory_types[node].map_count++) | 
|  | kref_get(&memtype->kref); | 
|  | } | 
|  | } | 
|  |  | 
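|  | /* | 
|  | * Associate @node with a memory type (the default DRAM type unless an | 
|  | * adistance algorithm or a driver says otherwise) and place it in the | 
|  | * matching memory tier. Called with memory_tier_lock held; returns the | 
|  | * memory tier or an ERR_PTR() on failure. | 
|  | */ | 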
|  | static struct memory_tier *set_node_memory_tier(int node) | 
|  | { | 
|  | struct memory_tier *memtier; | 
|  | struct memory_dev_type *memtype = default_dram_type; | 
|  | int adist = MEMTIER_ADISTANCE_DRAM; | 
|  | pg_data_t *pgdat = NODE_DATA(node); | 
|  |  | 
|  |  | 
|  | lockdep_assert_held_once(&memory_tier_lock); | 
|  |  | 
|  | if (!node_state(node, N_MEMORY)) | 
|  | return ERR_PTR(-EINVAL); | 
|  |  | 
|  | mt_calc_adistance(node, &adist); | 
|  | if (!node_memory_types[node].memtype) { | 
|  | memtype = mt_find_alloc_memory_type(adist, &default_memory_types); | 
|  | if (IS_ERR(memtype)) { | 
|  | memtype = default_dram_type; | 
|  | pr_info("Failed to allocate a memory type. Fall back.\n"); | 
|  | } | 
|  | } | 
|  |  | 
|  | __init_node_memory_type(node, memtype); | 
|  |  | 
|  | memtype = node_memory_types[node].memtype; | 
|  | node_set(node, memtype->nodes); | 
|  | memtier = find_create_memory_tier(memtype); | 
|  | if (!IS_ERR(memtier)) | 
|  | rcu_assign_pointer(pgdat->memtier, memtier); | 
|  | return memtier; | 
|  | } | 
|  |  | 
|  | static void destroy_memory_tier(struct memory_tier *memtier) | 
|  | { | 
|  | list_del(&memtier->list); | 
|  | device_unregister(&memtier->dev); | 
|  | } | 
|  |  | 
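|  | /* | 
|  | * Detach @node from its memory type and, if the type no longer has any | 
|  | * nodes, from its memory tier as well (destroying empty tiers). Returns | 
|  | * true if the node was part of a memory tier. Called with memory_tier_lock held. | 
|  | */ | 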
|  | static bool clear_node_memory_tier(int node) | 
|  | { | 
|  | bool cleared = false; | 
|  | pg_data_t *pgdat; | 
|  | struct memory_tier *memtier; | 
|  |  | 
|  | pgdat = NODE_DATA(node); | 
|  | if (!pgdat) | 
|  | return false; | 
|  |  | 
|  | /* | 
|  | * Make sure that anybody looking at NODE_DATA who finds | 
|  | * a valid memtier finds memory_dev_types with nodes still | 
|  | * linked to the memtier. We achieve this by waiting for the | 
|  | * RCU read section to finish using synchronize_rcu(). | 
|  | * This also enables us to free the destroyed memory tier | 
|  | * with kfree() instead of kfree_rcu(). | 
|  | */ | 
|  | memtier = __node_get_memory_tier(node); | 
|  | if (memtier) { | 
|  | struct memory_dev_type *memtype; | 
|  |  | 
|  | rcu_assign_pointer(pgdat->memtier, NULL); | 
|  | synchronize_rcu(); | 
|  | memtype = node_memory_types[node].memtype; | 
|  | node_clear(node, memtype->nodes); | 
|  | if (nodes_empty(memtype->nodes)) { | 
|  | list_del_init(&memtype->tier_sibling); | 
|  | if (list_empty(&memtier->memory_types)) | 
|  | destroy_memory_tier(memtier); | 
|  | } | 
|  | cleared = true; | 
|  | } | 
|  | return cleared; | 
|  | } | 
|  |  | 
|  | static void release_memtype(struct kref *kref) | 
|  | { | 
|  | struct memory_dev_type *memtype; | 
|  |  | 
|  | memtype = container_of(kref, struct memory_dev_type, kref); | 
|  | kfree(memtype); | 
|  | } | 
|  |  | 
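|  | /* | 
|  | * Allocate a new memory_dev_type with the given abstract distance. The | 
|  | * caller owns the initial reference; drop it with put_memory_type(). | 
|  | */ | 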
|  | struct memory_dev_type *alloc_memory_type(int adistance) | 
|  | { | 
|  | struct memory_dev_type *memtype; | 
|  |  | 
|  | memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); | 
|  | if (!memtype) | 
|  | return ERR_PTR(-ENOMEM); | 
|  |  | 
|  | memtype->adistance = adistance; | 
|  | INIT_LIST_HEAD(&memtype->tier_sibling); | 
|  | memtype->nodes  = NODE_MASK_NONE; | 
|  | kref_init(&memtype->kref); | 
|  | return memtype; | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(alloc_memory_type); | 
|  |  | 
|  | void put_memory_type(struct memory_dev_type *memtype) | 
|  | { | 
|  | kref_put(&memtype->kref, release_memtype); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(put_memory_type); | 
|  |  | 
|  | void init_node_memory_type(int node, struct memory_dev_type *memtype) | 
|  | { | 
|  |  | 
|  | mutex_lock(&memory_tier_lock); | 
|  | __init_node_memory_type(node, memtype); | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(init_node_memory_type); | 
|  |  | 
|  | void clear_node_memory_type(int node, struct memory_dev_type *memtype) | 
|  | { | 
|  | mutex_lock(&memory_tier_lock); | 
|  | if (node_memory_types[node].memtype == memtype || !memtype) | 
|  | node_memory_types[node].map_count--; | 
|  | /* | 
|  | * If we unmapped all the attached devices from this node, | 
|  | * clear the node memory type. | 
|  | */ | 
|  | if (!node_memory_types[node].map_count) { | 
|  | memtype = node_memory_types[node].memtype; | 
|  | node_memory_types[node].memtype = NULL; | 
|  | put_memory_type(memtype); | 
|  | } | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(clear_node_memory_type); | 
|  |  | 
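|  | /* | 
|  | * Look up a memory type with a matching abstract distance on @memory_types, | 
|  | * or allocate a new one and add it to the list. Pair with | 
|  | * mt_put_memory_types() to drop the references and empty the list. | 
|  | */ | 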
|  | struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types) | 
|  | { | 
|  | struct memory_dev_type *mtype; | 
|  |  | 
|  | list_for_each_entry(mtype, memory_types, list) | 
|  | if (mtype->adistance == adist) | 
|  | return mtype; | 
|  |  | 
|  | mtype = alloc_memory_type(adist); | 
|  | if (IS_ERR(mtype)) | 
|  | return mtype; | 
|  |  | 
|  | list_add(&mtype->list, memory_types); | 
|  |  | 
|  | return mtype; | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type); | 
|  |  | 
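|  | /* | 
|  | * Drop the references taken via mt_find_alloc_memory_type() and empty | 
|  | * the @memory_types list. | 
|  | */ | 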
|  | void mt_put_memory_types(struct list_head *memory_types) | 
|  | { | 
|  | struct memory_dev_type *mtype, *mtn; | 
|  |  | 
|  | list_for_each_entry_safe(mtype, mtn, memory_types, list) { | 
|  | list_del(&mtype->list); | 
|  | put_memory_type(mtype); | 
|  | } | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(mt_put_memory_types); | 
|  |  | 
|  | /* | 
|  | * This is invoked via `late_initcall()` to initialize memory tiers for | 
|  | * memory nodes, both with and without CPUs. By this point, after the | 
|  | * initialization of firmware and devices, adistance algorithms are | 
|  | * expected to be available. | 
|  | */ | 
|  | static int __init memory_tier_late_init(void) | 
|  | { | 
|  | int nid; | 
|  | struct memory_tier *memtier; | 
|  |  | 
|  | get_online_mems(); | 
|  | guard(mutex)(&memory_tier_lock); | 
|  |  | 
|  | /* Assign each uninitialized N_MEMORY node to a memory tier. */ | 
|  | for_each_node_state(nid, N_MEMORY) { | 
|  | /* | 
|  | * Some device drivers may have initialized | 
|  | * memory tiers, potentially bringing memory nodes | 
|  | * online and configuring memory tiers. | 
|  | * Exclude them here. | 
|  | */ | 
|  | if (node_memory_types[nid].memtype) | 
|  | continue; | 
|  |  | 
|  | memtier = set_node_memory_tier(nid); | 
|  | if (IS_ERR(memtier)) | 
|  | continue; | 
|  | } | 
|  |  | 
|  | establish_demotion_targets(); | 
|  | put_online_mems(); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  | late_initcall(memory_tier_late_init); | 
|  |  | 
|  | static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) | 
|  | { | 
|  | pr_info( | 
|  | "%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", | 
|  | prefix, coord->read_latency, coord->write_latency, | 
|  | coord->read_bandwidth, coord->write_bandwidth); | 
|  | } | 
|  |  | 
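|  | /* | 
|  | * Record @perf from @source as the default DRAM performance used as the | 
|  | * reference by mt_perf_to_adistance(). The first caller sets the reference; | 
|  | * later callers whose performance deviates from it by 10% or more disable | 
|  | * the algorithm and get -EINVAL. | 
|  | */ | 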
|  | int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, | 
|  | const char *source) | 
|  | { | 
|  | guard(mutex)(&default_dram_perf_lock); | 
|  | if (default_dram_perf_error) | 
|  | return -EIO; | 
|  |  | 
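|  | /* Reject performance data with zero total latency or bandwidth. */ | 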
|  | if (perf->read_latency + perf->write_latency == 0 || | 
|  | perf->read_bandwidth + perf->write_bandwidth == 0) | 
|  | return -EINVAL; | 
|  |  | 
|  | if (default_dram_perf_ref_nid == NUMA_NO_NODE) { | 
|  | default_dram_perf = *perf; | 
|  | default_dram_perf_ref_nid = nid; | 
|  | default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The performance of all default DRAM nodes is expected to be | 
|  | * the same (that is, the variation is less than 10%), and it | 
|  | * will be used as the base to calculate the abstract distance of | 
|  | * other memory nodes. | 
|  | */ | 
|  | if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 > | 
|  | default_dram_perf.read_latency || | 
|  | abs(perf->write_latency - default_dram_perf.write_latency) * 10 > | 
|  | default_dram_perf.write_latency || | 
|  | abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 > | 
|  | default_dram_perf.read_bandwidth || | 
|  | abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 > | 
|  | default_dram_perf.write_bandwidth) { | 
|  | pr_info( | 
|  | "memory-tiers: the performance of DRAM node %d mismatches that of the reference\n" | 
|  | "DRAM node %d.\n", nid, default_dram_perf_ref_nid); | 
|  | pr_info("  performance of reference DRAM node %d from %s:\n", | 
|  | default_dram_perf_ref_nid, default_dram_perf_ref_source); | 
|  | dump_hmem_attrs(&default_dram_perf, "    "); | 
|  | pr_info("  performance of DRAM node %d from %s:\n", nid, source); | 
|  | dump_hmem_attrs(perf, "    "); | 
|  | pr_info( | 
|  | "  disable default DRAM node performance based abstract distance algorithm.\n"); | 
|  | default_dram_perf_error = true; | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) | 
|  | { | 
|  | guard(mutex)(&default_dram_perf_lock); | 
|  | if (default_dram_perf_error) | 
|  | return -EIO; | 
|  |  | 
|  | if (perf->read_latency + perf->write_latency == 0 || | 
|  | perf->read_bandwidth + perf->write_bandwidth == 0) | 
|  | return -EINVAL; | 
|  |  | 
|  | if (default_dram_perf_ref_nid == NUMA_NO_NODE) | 
|  | return -ENOENT; | 
|  |  | 
|  | /* | 
|  | * The abstract distance of a memory node is in direct proportion to | 
|  | * its memory latency (read + write) and inversely proportional to its | 
|  | * memory bandwidth (read + write).  The abstract distance, memory | 
|  | * latency, and memory bandwidth of the default DRAM nodes are used as | 
|  | * the base. | 
|  | */ | 
|  | *adist = MEMTIER_ADISTANCE_DRAM * | 
|  | (perf->read_latency + perf->write_latency) / | 
|  | (default_dram_perf.read_latency + default_dram_perf.write_latency) * | 
|  | (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / | 
|  | (perf->read_bandwidth + perf->write_bandwidth); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(mt_perf_to_adistance); | 
|  |  | 
|  | /** | 
|  | * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm | 
|  | * @nb: The notifier block which describe the algorithm | 
|  | * | 
|  | * Return: 0 on success, errno on error. | 
|  | * | 
|  | * Every memory tiering abstract distance algorithm provider needs to | 
|  | * register the algorithm with register_mt_adistance_algorithm().  To | 
|  | * calculate the abstract distance for a specified memory node, the | 
|  | * notifier function will be called unless some higher priority | 
|  | * algorithm has already provided a result.  The prototype of the notifier | 
|  | * function is as follows, | 
|  | * | 
|  | *   int (*algorithm_notifier)(struct notifier_block *nb, | 
|  | *                             unsigned long nid, void *data); | 
|  | * | 
|  | * Where "nid" specifies the memory node, "data" is the pointer to the | 
|  | * returned abstract distance (that is, "int *adist").  If the | 
|  | * algorithm provides the result, NOTIFY_STOP should be returned. | 
|  | * Otherwise, return a value with %NOTIFY_STOP_MASK cleared to allow the next | 
|  | * algorithm in the chain to provide the result. | 
|  | */ | 
|  | int register_mt_adistance_algorithm(struct notifier_block *nb) | 
|  | { | 
|  | return blocking_notifier_chain_register(&mt_adistance_algorithms, nb); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm); | 
|  |  | 
|  | /** | 
|  | * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm | 
|  | * @nb: the notifier block which describe the algorithm | 
|  | * | 
|  | * Return: 0 on success, errno on error. | 
|  | */ | 
|  | int unregister_mt_adistance_algorithm(struct notifier_block *nb) | 
|  | { | 
|  | return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm); | 
|  |  | 
|  | /** | 
|  | * mt_calc_adistance() - Calculate abstract distance with registered algorithms | 
|  | * @node: the node to calculate abstract distance for | 
|  | * @adist: the returned abstract distance | 
|  | * | 
|  | * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some | 
|  | * abstract distance algorithm has provided the result, which is returned via | 
|  | * @adist.  Otherwise, no algorithm could provide the result and @adist | 
|  | * is left unchanged. | 
|  | */ | 
|  | int mt_calc_adistance(int node, int *adist) | 
|  | { | 
|  | return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist); | 
|  | } | 
|  | EXPORT_SYMBOL_GPL(mt_calc_adistance); | 
|  |  | 
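|  | /* | 
|  | * Node hotplug callback: tear down a node's memory tier when its last | 
|  | * memory goes away and set one up when its first memory appears, then | 
|  | * rebuild the demotion targets. | 
|  | */ | 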
|  | static int __meminit memtier_hotplug_callback(struct notifier_block *self, | 
|  | unsigned long action, void *_arg) | 
|  | { | 
|  | struct memory_tier *memtier; | 
|  | struct node_notify *nn = _arg; | 
|  |  | 
|  | switch (action) { | 
|  | case NODE_REMOVED_LAST_MEMORY: | 
|  | mutex_lock(&memory_tier_lock); | 
|  | if (clear_node_memory_tier(nn->nid)) | 
|  | establish_demotion_targets(); | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | break; | 
|  | case NODE_ADDED_FIRST_MEMORY: | 
|  | mutex_lock(&memory_tier_lock); | 
|  | memtier = set_node_memory_tier(nn->nid); | 
|  | if (!IS_ERR(memtier)) | 
|  | establish_demotion_targets(); | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | break; | 
|  | } | 
|  |  | 
|  | return notifier_from_errno(0); | 
|  | } | 
|  |  | 
|  | static int __init memory_tier_init(void) | 
|  | { | 
|  | int ret; | 
|  |  | 
|  | ret = subsys_virtual_register(&memory_tier_subsys, NULL); | 
|  | if (ret) | 
|  | panic("%s() failed to register memory tier subsystem\n", __func__); | 
|  |  | 
|  | #ifdef CONFIG_MIGRATION | 
|  | node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), | 
|  | GFP_KERNEL); | 
|  | WARN_ON(!node_demotion); | 
|  | #endif | 
|  |  | 
|  | mutex_lock(&memory_tier_lock); | 
|  | /* | 
|  | * For now we can have 4 faster memory tiers with smaller adistance | 
|  | * than the default DRAM tier. | 
|  | */ | 
|  | default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, | 
|  | &default_memory_types); | 
|  | mutex_unlock(&memory_tier_lock); | 
|  | if (IS_ERR(default_dram_type)) | 
|  | panic("%s() failed to allocate default DRAM tier\n", __func__); | 
|  |  | 
|  | /* Record nodes with memory and CPU to set default DRAM performance. */ | 
|  | nodes_and(default_dram_nodes, node_states[N_MEMORY], | 
|  | node_states[N_CPU]); | 
|  |  | 
|  | hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); | 
|  | return 0; | 
|  | } | 
|  | subsys_initcall(memory_tier_init); | 
|  |  | 
|  | bool numa_demotion_enabled = false; | 
|  |  | 
|  | #ifdef CONFIG_MIGRATION | 
|  | #ifdef CONFIG_SYSFS | 
|  | static ssize_t demotion_enabled_show(struct kobject *kobj, | 
|  | struct kobj_attribute *attr, char *buf) | 
|  | { | 
|  | return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled)); | 
|  | } | 
|  |  | 
|  | static ssize_t demotion_enabled_store(struct kobject *kobj, | 
|  | struct kobj_attribute *attr, | 
|  | const char *buf, size_t count) | 
|  | { | 
|  | ssize_t ret; | 
|  |  | 
|  | ret = kstrtobool(buf, &numa_demotion_enabled); | 
|  | if (ret) | 
|  | return ret; | 
|  |  | 
|  | return count; | 
|  | } | 
|  |  | 
|  | static struct kobj_attribute numa_demotion_enabled_attr = | 
|  | __ATTR_RW(demotion_enabled); | 
|  |  | 
|  | static struct attribute *numa_attrs[] = { | 
|  | &numa_demotion_enabled_attr.attr, | 
|  | NULL, | 
|  | }; | 
|  |  | 
|  | static const struct attribute_group numa_attr_group = { | 
|  | .attrs = numa_attrs, | 
|  | }; | 
|  |  | 
|  | static int __init numa_init_sysfs(void) | 
|  | { | 
|  | int err; | 
|  | struct kobject *numa_kobj; | 
|  |  | 
|  | numa_kobj = kobject_create_and_add("numa", mm_kobj); | 
|  | if (!numa_kobj) { | 
|  | pr_err("failed to create numa kobject\n"); | 
|  | return -ENOMEM; | 
|  | } | 
|  | err = sysfs_create_group(numa_kobj, &numa_attr_group); | 
|  | if (err) { | 
|  | pr_err("failed to register numa group\n"); | 
|  | goto delete_obj; | 
|  | } | 
|  | return 0; | 
|  |  | 
|  | delete_obj: | 
|  | kobject_put(numa_kobj); | 
|  | return err; | 
|  | } | 
|  | subsys_initcall(numa_init_sysfs); | 
|  | #endif /* CONFIG_SYSFS */ | 
|  | #endif |