// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 /* * Copyright (C) 2022 Alibaba Cloud */ #include "erofs/dedupe.h" #include "erofs/print.h" #include "rb_tree.h" #include "rolling_hash.h" #include "sha256.h" unsigned long erofs_memcmp2(const u8 *s1, const u8 *s2, unsigned long sz) { const unsigned long *a1, *a2; unsigned long n = sz; if (sz < sizeof(long)) goto out_bytes; if (((long)s1 & (sizeof(long) - 1)) == ((long)s2 & (sizeof(long) - 1))) { while ((long)s1 & (sizeof(long) - 1)) { if (*s1 != *s2) break; ++s1; ++s2; --sz; } a1 = (const unsigned long *)s1; a2 = (const unsigned long *)s2; while (sz >= sizeof(long)) { if (*a1 != *a2) break; ++a1; ++a2; sz -= sizeof(long); } } else { a1 = (const unsigned long *)s1; a2 = (const unsigned long *)s2; do { if (get_unaligned(a1) != get_unaligned(a2)) break; ++a1; ++a2; sz -= sizeof(long); } while (sz >= sizeof(long)); } s1 = (const u8 *)a1; s2 = (const u8 *)a2; out_bytes: while (sz) { if (*s1 != *s2) break; ++s1; ++s2; --sz; } return n - sz; } static unsigned int window_size, rollinghash_rm; static struct rb_tree *dedupe_tree, *dedupe_subtree; struct z_erofs_dedupe_item { long long hash; u8 prefix_sha256[32]; erofs_blk_t compressed_blkaddr; unsigned int compressed_blks; int original_length; bool partial, raw; u8 extra_data[]; }; static int z_erofs_dedupe_rbtree_cmp(struct rb_tree *self, struct rb_node *node_a, struct rb_node *node_b) { struct z_erofs_dedupe_item *e_a = node_a->value; struct z_erofs_dedupe_item *e_b = node_b->value; return (e_a->hash > e_b->hash) - (e_a->hash < e_b->hash); } int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx) { struct z_erofs_dedupe_item e_find; u8 *cur; bool initial = true; if (!dedupe_tree) return -ENOENT; if (ctx->cur > ctx->end - window_size) cur = ctx->end - window_size; else cur = ctx->cur; /* move backward byte-by-byte */ for (; cur >= ctx->start; --cur) { struct z_erofs_dedupe_item *e; unsigned int extra; u8 sha256[32]; if (initial) { /* initial try */ e_find.hash = erofs_rolling_hash_init(cur, window_size, true); initial = false; } else { e_find.hash = erofs_rolling_hash_advance(e_find.hash, rollinghash_rm, cur[window_size], cur[0]); } e = rb_tree_find(dedupe_tree, &e_find); if (!e) { e = rb_tree_find(dedupe_subtree, &e_find); if (!e) continue; } erofs_sha256(cur, window_size, sha256); if (memcmp(sha256, e->prefix_sha256, sizeof(sha256))) continue; extra = min_t(unsigned int, ctx->end - cur - window_size, e->original_length - window_size); extra = erofs_memcmp2(cur + window_size, e->extra_data, extra); if (window_size + extra <= ctx->cur - cur) continue; ctx->cur = cur; ctx->e.length = window_size + extra; ctx->e.partial = e->partial || (window_size + extra < e->original_length); ctx->e.raw = e->raw; ctx->e.blkaddr = e->compressed_blkaddr; ctx->e.compressedblks = e->compressed_blks; return 0; } return -ENOENT; } int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e, void *original_data) { struct z_erofs_dedupe_item *di; if (!dedupe_subtree || e->length < window_size) return 0; di = malloc(sizeof(*di) + e->length - window_size); if (!di) return -ENOMEM; di->original_length = e->length; erofs_sha256(original_data, window_size, di->prefix_sha256); di->hash = erofs_rolling_hash_init(original_data, window_size, true); memcpy(di->extra_data, original_data + window_size, e->length - window_size); di->compressed_blkaddr = e->blkaddr; di->compressed_blks = e->compressedblks; di->partial = e->partial; di->raw = e->raw; /* with the same rolling hash */ if (!rb_tree_insert(dedupe_subtree, di)) free(di); return 0; } static void z_erofs_dedupe_node_free_cb(struct rb_tree *self, struct rb_node *node) { free(node->value); rb_tree_node_dealloc_cb(self, node); } void z_erofs_dedupe_commit(bool drop) { if (!dedupe_subtree) return; if (!drop) { struct rb_iter iter; struct z_erofs_dedupe_item *di; di = rb_iter_first(&iter, dedupe_subtree); while (di) { if (!rb_tree_insert(dedupe_tree, di)) DBG_BUGON(1); di = rb_iter_next(&iter); } /*rb_iter_dealloc(iter);*/ rb_tree_dealloc(dedupe_subtree, rb_tree_node_dealloc_cb); } else { rb_tree_dealloc(dedupe_subtree, z_erofs_dedupe_node_free_cb); } dedupe_subtree = rb_tree_create(z_erofs_dedupe_rbtree_cmp); } int z_erofs_dedupe_init(unsigned int wsiz) { dedupe_tree = rb_tree_create(z_erofs_dedupe_rbtree_cmp); if (!dedupe_tree) return -ENOMEM; dedupe_subtree = rb_tree_create(z_erofs_dedupe_rbtree_cmp); if (!dedupe_subtree) { rb_tree_dealloc(dedupe_subtree, NULL); return -ENOMEM; } window_size = wsiz; rollinghash_rm = erofs_rollinghash_calc_rm(window_size); return 0; } void z_erofs_dedupe_exit(void) { z_erofs_dedupe_commit(true); rb_tree_dealloc(dedupe_subtree, NULL); rb_tree_dealloc(dedupe_tree, z_erofs_dedupe_node_free_cb); }