blob: 074cae333fd270b82c3c7979616952cd391841af [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0
/*
* Copyright (C) 2022 Alibaba Cloud
*/
#include <stdlib.h>
#include "erofs/dedupe.h"
#include "erofs/print.h"
#include "rolling_hash.h"
#include "liberofs_xxhash.h"
#include "sha256.h"
unsigned long erofs_memcmp2(const u8 *s1, const u8 *s2,
unsigned long sz)
{
const unsigned long *a1, *a2;
unsigned long n = sz;
if (sz < sizeof(long))
goto out_bytes;
if (((long)s1 & (sizeof(long) - 1)) ==
((long)s2 & (sizeof(long) - 1))) {
while ((long)s1 & (sizeof(long) - 1)) {
if (*s1 != *s2)
break;
++s1;
++s2;
--sz;
}
a1 = (const unsigned long *)s1;
a2 = (const unsigned long *)s2;
while (sz >= sizeof(long)) {
if (*a1 != *a2)
break;
++a1;
++a2;
sz -= sizeof(long);
}
} else {
a1 = (const unsigned long *)s1;
a2 = (const unsigned long *)s2;
do {
if (get_unaligned(a1) != get_unaligned(a2))
break;
++a1;
++a2;
sz -= sizeof(long);
} while (sz >= sizeof(long));
}
s1 = (const u8 *)a1;
s2 = (const u8 *)a2;
out_bytes:
while (sz) {
if (*s1 != *s2)
break;
++s1;
++s2;
--sz;
}
return n - sz;
}
static unsigned int window_size, rollinghash_rm;
static struct list_head dedupe_tree[65536];
struct z_erofs_dedupe_item *dedupe_subtree;
struct z_erofs_dedupe_item {
struct list_head list;
struct z_erofs_dedupe_item *chain;
long long hash;
u8 prefix_sha256[32];
u64 prefix_xxh64;
erofs_off_t pstart;
unsigned int plen;
int original_length;
bool partial, raw;
u8 extra_data[];
};
int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx)
{
struct z_erofs_dedupe_item e_find;
u8 *cur;
bool initial = true;
if (!window_size)
return -ENOENT;
if (ctx->cur > ctx->end - window_size)
cur = ctx->end - window_size;
else
cur = ctx->cur;
/* move backward byte-by-byte */
for (; cur >= ctx->start; --cur) {
struct list_head *p;
struct z_erofs_dedupe_item *e;
unsigned int extra = 0;
u64 xxh64_csum = 0;
u8 sha256[32];
if (initial) {
/* initial try */
e_find.hash = erofs_rolling_hash_init(cur, window_size, true);
initial = false;
} else {
e_find.hash = erofs_rolling_hash_advance(e_find.hash,
rollinghash_rm, cur[window_size], cur[0]);
}
p = &dedupe_tree[e_find.hash & (ARRAY_SIZE(dedupe_tree) - 1)];
list_for_each_entry(e, p, list) {
if (e->hash != e_find.hash)
continue;
if (!extra) {
xxh64_csum = xxh64(cur, window_size, 0);
extra = 1;
}
if (e->prefix_xxh64 == xxh64_csum)
break;
}
if (&e->list == p)
continue;
erofs_sha256(cur, window_size, sha256);
if (memcmp(sha256, e->prefix_sha256, sizeof(sha256)))
continue;
extra = min_t(unsigned int, ctx->end - cur - window_size,
e->original_length - window_size);
extra = erofs_memcmp2(cur + window_size, e->extra_data, extra);
if (window_size + extra <= ctx->cur - cur)
continue;
ctx->cur = cur;
ctx->e.length = window_size + extra;
ctx->e.partial = e->partial ||
(window_size + extra < e->original_length);
ctx->e.raw = e->raw;
ctx->e.inlined = false;
ctx->e.pstart = e->pstart;
ctx->e.plen = e->plen;
return 0;
}
return -ENOENT;
}
int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
void *original_data)
{
struct list_head *p;
struct z_erofs_dedupe_item *di, *k;
if (!window_size || e->length < window_size)
return 0;
di = malloc(sizeof(*di) + e->length - window_size);
if (!di)
return -ENOMEM;
di->original_length = e->length;
erofs_sha256(original_data, window_size, di->prefix_sha256);
di->prefix_xxh64 = xxh64(original_data, window_size, 0);
di->hash = erofs_rolling_hash_init(original_data,
window_size, true);
memcpy(di->extra_data, original_data + window_size,
e->length - window_size);
di->pstart = e->pstart;
di->plen = e->plen;
di->partial = e->partial;
di->raw = e->raw;
/* skip the same xxh64 hash */
p = &dedupe_tree[di->hash & (ARRAY_SIZE(dedupe_tree) - 1)];
list_for_each_entry(k, p, list) {
if (k->prefix_xxh64 == di->prefix_xxh64) {
free(di);
return 0;
}
}
di->chain = dedupe_subtree;
dedupe_subtree = di;
list_add_tail(&di->list, p);
return 0;
}
void z_erofs_dedupe_commit(bool drop)
{
if (!dedupe_subtree)
return;
if (drop) {
struct z_erofs_dedupe_item *di, *n;
for (di = dedupe_subtree; di; di = n) {
n = di->chain;
list_del(&di->list);
free(di);
}
}
dedupe_subtree = NULL;
}
int z_erofs_dedupe_init(unsigned int wsiz)
{
struct list_head *p;
for (p = dedupe_tree;
p < dedupe_tree + ARRAY_SIZE(dedupe_tree); ++p)
init_list_head(p);
window_size = wsiz;
rollinghash_rm = erofs_rollinghash_calc_rm(window_size);
return 0;
}
void z_erofs_dedupe_exit(void)
{
struct z_erofs_dedupe_item *di, *n;
struct list_head *p;
if (!window_size)
return;
z_erofs_dedupe_commit(true);
for (p = dedupe_tree;
p < dedupe_tree + ARRAY_SIZE(dedupe_tree); ++p) {
list_for_each_entry_safe(di, n, p, list) {
list_del(&di->list);
free(di);
}
}
dedupe_subtree = NULL;
}