// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022, Alibaba Cloud
 * Copyright (C) 2022, Bytedance Inc. All rights reserved.
 */
#include <linux/fscache.h>
#include "internal.h"

static DEFINE_MUTEX(erofs_domain_list_lock);
static DEFINE_MUTEX(erofs_domain_cookies_lock);
static LIST_HEAD(erofs_domain_list);
static struct vfsmount *erofs_pseudo_mnt;

static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
							    loff_t start, size_t len)
{
	struct netfs_io_request *rreq;

	rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
	if (!rreq)
		return ERR_PTR(-ENOMEM);

	rreq->start = start;
	rreq->len = len;
	rreq->mapping = mapping;
	rreq->inode = mapping->host;
	INIT_LIST_HEAD(&rreq->subrequests);
	refcount_set(&rreq->ref, 1);
	return rreq;
}

static void erofs_fscache_put_request(struct netfs_io_request *rreq)
{
	if (!refcount_dec_and_test(&rreq->ref))
		return;
	if (rreq->cache_resources.ops)
		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
	kfree(rreq);
}

static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
{
	if (!refcount_dec_and_test(&subreq->ref))
		return;
	erofs_fscache_put_request(subreq->rreq);
	kfree(subreq);
}

static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;

	while (!list_empty(&rreq->subrequests)) {
		subreq = list_first_entry(&rreq->subrequests,
					  struct netfs_io_subrequest, rreq_link);
		list_del(&subreq->rreq_link);
		erofs_fscache_put_subrequest(subreq);
	}
}

static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct folio *folio;
	unsigned int iopos = 0;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		unsigned int pgpos, pgend;
		bool pg_failed = false;

		if (xas_retry(&xas, folio))
			continue;

		pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
		pgend = pgpos + folio_size(folio);

		for (;;) {
			if (!subreq) {
				pg_failed = true;
				break;
			}

			pg_failed |= subreq_failed;
			if (pgend < iopos + subreq->len)
				break;

			iopos += subreq->len;
			if (!list_is_last(&subreq->rreq_link,
					  &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pgend == iopos)
				break;
		}

		if (!pg_failed)
			folio_mark_uptodate(folio);

		folio_unlock(folio);
	}
	rcu_read_unlock();
}

static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
{
	erofs_fscache_rreq_unlock_folios(rreq);
	erofs_fscache_clear_subrequests(rreq);
	erofs_fscache_put_request(rreq);
}

static void erofc_fscache_subreq_complete(void *priv,
		ssize_t transferred_or_error, bool was_async)
{
	struct netfs_io_subrequest *subreq = priv;
	struct netfs_io_request *rreq = subreq->rreq;

	if (IS_ERR_VALUE(transferred_or_error))
		subreq->error = transferred_or_error;

	if (atomic_dec_and_test(&rreq->nr_outstanding))
		erofs_fscache_rreq_complete(rreq);

	erofs_fscache_put_subrequest(subreq);
}
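
/*
 * Object lifecycle in the read path below: an rreq is allocated with
 * ref == 1 by erofs_fscache_alloc_request(); each subrequest pins the rreq
 * with an extra reference and itself starts with ref == 2, one reference for
 * its entry on rreq->subrequests (dropped in erofs_fscache_clear_subrequests())
 * and one held across the asynchronous fscache_read() (dropped in
 * erofc_fscache_subreq_complete()).  rreq->nr_outstanding is primed to 1 so
 * that the request can only complete once submission has finished and every
 * subrequest has called back.
 */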
/*
 * Read data from fscache, filling the page cache described by @rreq; both the
 * start and length of @rreq shall be aligned to PAGE_SIZE. @pstart describes
 * the start physical address in the cache file.
 */
static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
					   struct netfs_io_request *rreq,
					   loff_t pstart)
{
	enum netfs_io_source source;
	struct super_block *sb = rreq->mapping->host->i_sb;
	struct netfs_io_subrequest *subreq;
	struct netfs_cache_resources *cres = &rreq->cache_resources;
	struct iov_iter iter;
	loff_t start = rreq->start;
	size_t len = rreq->len;
	size_t done = 0;
	int ret;

	atomic_set(&rreq->nr_outstanding, 1);

	ret = fscache_begin_read_operation(cres, cookie);
	if (ret)
		goto out;

	while (done < len) {
		subreq = kzalloc(sizeof(struct netfs_io_subrequest),
				 GFP_KERNEL);
		if (subreq) {
			INIT_LIST_HEAD(&subreq->rreq_link);
			refcount_set(&subreq->ref, 2);
			subreq->rreq = rreq;
			refcount_inc(&rreq->ref);
		} else {
			ret = -ENOMEM;
			goto out;
		}

		subreq->start = pstart + done;
		subreq->len = len - done;
		subreq->flags = 1 << NETFS_SREQ_ONDEMAND;

		list_add_tail(&subreq->rreq_link, &rreq->subrequests);

		source = cres->ops->prepare_read(subreq, LLONG_MAX);
		if (WARN_ON(subreq->len == 0))
			source = NETFS_INVALID_READ;
		if (source != NETFS_READ_FROM_CACHE) {
			erofs_err(sb, "failed to fscache prepare_read (source %d)",
				  source);
			ret = -EIO;
			subreq->error = ret;
			erofs_fscache_put_subrequest(subreq);
			goto out;
		}

		atomic_inc(&rreq->nr_outstanding);

		iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
				start + done, subreq->len);

		ret = fscache_read(cres, subreq->start, &iter,
				   NETFS_READ_HOLE_FAIL,
				   erofc_fscache_subreq_complete, subreq);
		if (ret == -EIOCBQUEUED)
			ret = 0;
		if (ret) {
			erofs_err(sb, "failed to fscache_read (ret %d)", ret);
			goto out;
		}

		done += subreq->len;
	}
out:
	if (atomic_dec_and_test(&rreq->nr_outstanding))
		erofs_fscache_rreq_complete(rreq);

	return ret;
}
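
/*
 * .read_folio() for the special inodes set up in
 * erofs_fscache_acquire_cookie() below: map the folio position to the backing
 * blob via erofs_map_dev() and read it in through that blob's fscache cookie.
 */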
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
	int ret;
	struct super_block *sb = folio_mapping(folio)->host->i_sb;
	struct netfs_io_request *rreq;
	struct erofs_map_dev mdev = {
		.m_deviceid = 0,
		.m_pa = folio_pos(folio),
	};

	ret = erofs_map_dev(sb, &mdev);
	if (ret)
		goto out;

	rreq = erofs_fscache_alloc_request(folio_mapping(folio),
				folio_pos(folio), folio_size(folio));
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto out;
	}
	return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
				rreq, mdev.m_pa);
out:
	folio_unlock(folio);
	return ret;
}

/*
 * Read into page cache in the range described by (@pos, @len).
 *
 * On return, if the output @unlock is true, the caller is responsible for page
 * unlocking; otherwise the callee takes over that responsibility through the
 * netfs_io_request interface.
 *
 * The return value is the number of bytes successfully handled, or a negative
 * error code on failure. The only exception is that the length of the range,
 * instead of the error code, is returned on a failure that occurs after the
 * netfs_io_request has been allocated, so that .readahead() can advance rac
 * accordingly.
 */
static int erofs_fscache_data_read(struct address_space *mapping,
				   loff_t pos, size_t len, bool *unlock)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	struct netfs_io_request *rreq;
	struct erofs_map_blocks map;
	struct erofs_map_dev mdev;
	struct iov_iter iter;
	size_t count;
	int ret;

	*unlock = true;

	map.m_la = pos;
	ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
	if (ret)
		return ret;

	if (map.m_flags & EROFS_MAP_META) {
		struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
		erofs_blk_t blknr;
		size_t offset, size;
		void *src;

		/* For the tail-packing layout, the offset may be non-zero. */
		offset = erofs_blkoff(sb, map.m_pa);
		blknr = erofs_blknr(sb, map.m_pa);
		size = map.m_llen;

		src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
		if (IS_ERR(src))
			return PTR_ERR(src);

		iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
		if (copy_to_iter(src + offset, size, &iter) != size) {
			erofs_put_metabuf(&buf);
			return -EFAULT;
		}
		iov_iter_zero(PAGE_SIZE - size, &iter);
		erofs_put_metabuf(&buf);
		return PAGE_SIZE;
	}

	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
		count = len;
		iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
		iov_iter_zero(count, &iter);
		return count;
	}

	count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
	DBG_BUGON(!count || count % PAGE_SIZE);

	mdev = (struct erofs_map_dev) {
		.m_deviceid = map.m_deviceid,
		.m_pa = map.m_pa,
	};
	ret = erofs_map_dev(sb, &mdev);
	if (ret)
		return ret;

	rreq = erofs_fscache_alloc_request(mapping, pos, count);
	if (IS_ERR(rreq))
		return PTR_ERR(rreq);

	*unlock = false;
	erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
			rreq, mdev.m_pa + (pos - map.m_la));
	return count;
}

static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
	bool unlock;
	int ret;

	ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
				      folio_size(folio), &unlock);
	if (unlock) {
		if (ret > 0)
			folio_mark_uptodate(folio);
		folio_unlock(folio);
	}
	return ret < 0 ? ret : 0;
}

static void erofs_fscache_readahead(struct readahead_control *rac)
{
	struct folio *folio;
	size_t len, done = 0;
	loff_t start, pos;
	bool unlock;
	int ret, size;

	if (!readahead_count(rac))
		return;

	start = readahead_pos(rac);
	len = readahead_length(rac);

	do {
		pos = start + done;
		ret = erofs_fscache_data_read(rac->mapping, pos,
					      len - done, &unlock);
		if (ret <= 0)
			return;

		size = ret;
		while (size) {
			folio = readahead_folio(rac);
			size -= folio_size(folio);
			if (unlock) {
				folio_mark_uptodate(folio);
				folio_unlock(folio);
			}
		}
	} while ((done += ret) < len);
}

static const struct address_space_operations erofs_fscache_meta_aops = {
	.read_folio = erofs_fscache_meta_read_folio,
};

const struct address_space_operations erofs_fscache_access_aops = {
	.read_folio = erofs_fscache_read_folio,
	.readahead = erofs_fscache_readahead,
};
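
/*
 * Domain management.  A "domain" lets several erofs instances share one
 * fscache volume: domains are kept on erofs_domain_list and refcounted, and
 * every cookie registered within a domain is backed by an anonymous inode on
 * a shared pseudo mount so that a blob of the same name can be reused across
 * filesystems in that domain.  Roughly, from the userspace side (a sketch,
 * not code in this file):
 *
 *	mount -t erofs none -o fsid=bootstrap1,domain_id=mydom /mnt/1
 *	mount -t erofs none -o fsid=bootstrap2,domain_id=mydom /mnt/2
 *
 * Both mounts end up attached to the same erofs_domain and hence the same
 * fscache volume.
 */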
static void erofs_fscache_domain_put(struct erofs_domain *domain)
{
	if (!domain)
		return;
	mutex_lock(&erofs_domain_list_lock);
	if (refcount_dec_and_test(&domain->ref)) {
		list_del(&domain->list);
		if (list_empty(&erofs_domain_list)) {
			kern_unmount(erofs_pseudo_mnt);
			erofs_pseudo_mnt = NULL;
		}
		fscache_relinquish_volume(domain->volume, NULL, false);
		mutex_unlock(&erofs_domain_list_lock);
		kfree(domain->domain_id);
		kfree(domain);
		return;
	}
	mutex_unlock(&erofs_domain_list_lock);
}

static int erofs_fscache_register_volume(struct super_block *sb)
{
	struct erofs_sb_info *sbi = EROFS_SB(sb);
	char *domain_id = sbi->domain_id;
	struct fscache_volume *volume;
	char *name;
	int ret = 0;

	name = kasprintf(GFP_KERNEL, "erofs,%s",
			 domain_id ? domain_id : sbi->fsid);
	if (!name)
		return -ENOMEM;

	volume = fscache_acquire_volume(name, NULL, NULL, 0);
	if (IS_ERR_OR_NULL(volume)) {
		erofs_err(sb, "failed to register volume for %s", name);
		ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
		volume = NULL;
	}

	sbi->volume = volume;
	kfree(name);
	return ret;
}

static int erofs_fscache_init_domain(struct super_block *sb)
{
	int err;
	struct erofs_domain *domain;
	struct erofs_sb_info *sbi = EROFS_SB(sb);

	domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
	if (!domain)
		return -ENOMEM;

	domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL);
	if (!domain->domain_id) {
		kfree(domain);
		return -ENOMEM;
	}

	err = erofs_fscache_register_volume(sb);
	if (err)
		goto out;

	if (!erofs_pseudo_mnt) {
		erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
		if (IS_ERR(erofs_pseudo_mnt)) {
			err = PTR_ERR(erofs_pseudo_mnt);
			goto out;
		}
	}

	domain->volume = sbi->volume;
	refcount_set(&domain->ref, 1);
	list_add(&domain->list, &erofs_domain_list);
	sbi->domain = domain;
	return 0;
out:
	kfree(domain->domain_id);
	kfree(domain);
	return err;
}

static int erofs_fscache_register_domain(struct super_block *sb)
{
	int err;
	struct erofs_domain *domain;
	struct erofs_sb_info *sbi = EROFS_SB(sb);

	mutex_lock(&erofs_domain_list_lock);
	list_for_each_entry(domain, &erofs_domain_list, list) {
		if (!strcmp(domain->domain_id, sbi->domain_id)) {
			sbi->domain = domain;
			sbi->volume = domain->volume;
			refcount_inc(&domain->ref);
			mutex_unlock(&erofs_domain_list_lock);
			return 0;
		}
	}
	err = erofs_fscache_init_domain(sb);
	mutex_unlock(&erofs_domain_list_lock);
	return err;
}

static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
						char *name, unsigned int flags)
{
	struct fscache_volume *volume = EROFS_SB(sb)->volume;
	struct erofs_fscache *ctx;
	struct fscache_cookie *cookie;
	int ret;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
					name, strlen(name), NULL, 0, 0);
	if (!cookie) {
		erofs_err(sb, "failed to get cookie for %s", name);
		ret = -EINVAL;
		goto err;
	}

	fscache_use_cookie(cookie, false);
	ctx->cookie = cookie;

	if (flags & EROFS_REG_COOKIE_NEED_INODE) {
		struct inode *const inode = new_inode(sb);

		if (!inode) {
			erofs_err(sb, "failed to get anon inode for %s", name);
			ret = -ENOMEM;
			goto err_cookie;
		}

		set_nlink(inode, 1);
		inode->i_size = OFFSET_MAX;
		inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
		mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
		inode->i_blkbits = EROFS_SB(sb)->blkszbits;
		ctx->inode = inode;
	}

	return ctx;

err_cookie:
	fscache_unuse_cookie(ctx->cookie, NULL, NULL);
	fscache_relinquish_cookie(ctx->cookie, false);
err:
	kfree(ctx);
	return ERR_PTR(ret);
}

static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
{
	fscache_unuse_cookie(ctx->cookie, NULL, NULL);
	fscache_relinquish_cookie(ctx->cookie, false);
	iput(ctx->inode);
	kfree(ctx->name);
	kfree(ctx);
}

static struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
						char *name, unsigned int flags)
{
	int err;
	struct inode *inode;
	struct erofs_fscache *ctx;
	struct erofs_domain *domain = EROFS_SB(sb)->domain;

	ctx = erofs_fscache_acquire_cookie(sb, name, flags);
	if (IS_ERR(ctx))
		return ctx;

	ctx->name = kstrdup(name, GFP_KERNEL);
	if (!ctx->name) {
		err = -ENOMEM;
		goto out;
	}

	inode = new_inode(erofs_pseudo_mnt->mnt_sb);
	if (!inode) {
		err = -ENOMEM;
		goto out;
	}

	ctx->domain = domain;
	ctx->anon_inode = inode;
	inode->i_private = ctx;
	refcount_inc(&domain->ref);
	return ctx;
out:
	erofs_fscache_relinquish_cookie(ctx);
	return ERR_PTR(err);
}
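
/*
 * Look up @name in the shared domain: reuse (igrab) the anonymous inode of an
 * already registered cookie with the same name, or fall back to creating a
 * new one.  With EROFS_REG_COOKIE_NEED_NOEXIST set, an existing cookie is
 * instead reported as -EEXIST, which keeps the primary data blob (fsid)
 * unique within the domain.
 */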
static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
						char *name, unsigned int flags)
{
	struct inode *inode;
	struct erofs_fscache *ctx;
	struct erofs_domain *domain = EROFS_SB(sb)->domain;
	struct super_block *psb = erofs_pseudo_mnt->mnt_sb;

	mutex_lock(&erofs_domain_cookies_lock);
	spin_lock(&psb->s_inode_list_lock);
	list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
		ctx = inode->i_private;
		if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
			continue;
		if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
			igrab(inode);
		} else {
			erofs_err(sb, "%s already exists in domain %s", name,
				  domain->domain_id);
			ctx = ERR_PTR(-EEXIST);
		}
		spin_unlock(&psb->s_inode_list_lock);
		mutex_unlock(&erofs_domain_cookies_lock);
		return ctx;
	}
	spin_unlock(&psb->s_inode_list_lock);
	ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
	mutex_unlock(&erofs_domain_cookies_lock);
	return ctx;
}

struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
						    char *name, unsigned int flags)
{
	if (EROFS_SB(sb)->domain_id)
		return erofs_domain_register_cookie(sb, name, flags);
	return erofs_fscache_acquire_cookie(sb, name, flags);
}

void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
{
	bool drop;
	struct erofs_domain *domain;

	if (!ctx)
		return;
	domain = ctx->domain;
	if (domain) {
		mutex_lock(&erofs_domain_cookies_lock);
		drop = atomic_read(&ctx->anon_inode->i_count) == 1;
		iput(ctx->anon_inode);
		mutex_unlock(&erofs_domain_cookies_lock);
		if (!drop)
			return;
	}

	erofs_fscache_relinquish_cookie(ctx);
	erofs_fscache_domain_put(domain);
}

int erofs_fscache_register_fs(struct super_block *sb)
{
	int ret;
	struct erofs_sb_info *sbi = EROFS_SB(sb);
	struct erofs_fscache *fscache;
	unsigned int flags;

	if (sbi->domain_id)
		ret = erofs_fscache_register_domain(sb);
	else
		ret = erofs_fscache_register_volume(sb);
	if (ret)
		return ret;

	/*
	 * When a shared domain is enabled, use NEED_NOEXIST to guarantee that
	 * the primary data blob (aka fsid) is unique in the shared domain.
	 *
	 * For the non-shared-domain case, fscache_acquire_volume() invoked by
	 * erofs_fscache_register_volume() has already guaranteed the
	 * uniqueness of the primary data blob.
	 *
	 * An acquired domain/volume will be relinquished in kill_sb() on error.
	 */
	flags = EROFS_REG_COOKIE_NEED_INODE;
	if (sbi->domain_id)
		flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
	fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
	if (IS_ERR(fscache))
		return PTR_ERR(fscache);

	sbi->s_fscache = fscache;
	return 0;
}

void erofs_fscache_unregister_fs(struct super_block *sb)
{
	struct erofs_sb_info *sbi = EROFS_SB(sb);

	erofs_fscache_unregister_cookie(sbi->s_fscache);

	if (sbi->domain)
		erofs_fscache_domain_put(sbi->domain);
	else
		fscache_relinquish_volume(sbi->volume, NULL, false);

	sbi->s_fscache = NULL;
	sbi->volume = NULL;
	sbi->domain = NULL;
}