1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2008 Red Hat. All rights reserved.
4 */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "free-space-cache.h"
16 #include "transaction.h"
17 #include "disk-io.h"
18 #include "extent_io.h"
19 #include "inode-map.h"
20 #include "volumes.h"
21 #include "space-info.h"
22 #include "delalloc-space.h"
23 #include "block-group.h"
24 #include "discard.h"
25
26 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
27 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
28 #define FORCE_EXTENT_THRESHOLD SZ_1M
29
30 struct btrfs_trim_range {
31 u64 start;
32 u64 bytes;
33 struct list_head list;
34 };
35
36 static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
37 struct btrfs_free_space *bitmap_info);
38 static int link_free_space(struct btrfs_free_space_ctl *ctl,
39 struct btrfs_free_space *info);
40 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
41 struct btrfs_free_space *info);
42 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
43 struct btrfs_trans_handle *trans,
44 struct btrfs_io_ctl *io_ctl,
45 struct btrfs_path *path);
46
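/*
 * Look up the free space cache inode for the block group starting at @offset.
 * The inode's object id is stored in the free space header item keyed at
 * (BTRFS_FREE_SPACE_OBJECTID, 0, offset); the inode mapping's gfp mask is then
 * restricted so page cache allocations for it cannot recurse into the fs.
 */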
47 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
48 struct btrfs_path *path,
49 u64 offset)
50 {
51 struct btrfs_fs_info *fs_info = root->fs_info;
52 struct btrfs_key key;
53 struct btrfs_key location;
54 struct btrfs_disk_key disk_key;
55 struct btrfs_free_space_header *header;
56 struct extent_buffer *leaf;
57 struct inode *inode = NULL;
58 unsigned nofs_flag;
59 int ret;
60
61 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
62 key.offset = offset;
63 key.type = 0;
64
65 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
66 if (ret < 0)
67 return ERR_PTR(ret);
68 if (ret > 0) {
69 btrfs_release_path(path);
70 return ERR_PTR(-ENOENT);
71 }
72
73 leaf = path->nodes[0];
74 header = btrfs_item_ptr(leaf, path->slots[0],
75 struct btrfs_free_space_header);
76 btrfs_free_space_key(leaf, header, &disk_key);
77 btrfs_disk_key_to_cpu(&location, &disk_key);
78 btrfs_release_path(path);
79
80 /*
81 * We are often under a trans handle at this point, so we need to make
82 * sure NOFS is set to keep us from deadlocking.
83 */
84 nofs_flag = memalloc_nofs_save();
85 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
86 btrfs_release_path(path);
87 memalloc_nofs_restore(nofs_flag);
88 if (IS_ERR(inode))
89 return inode;
90
91 mapping_set_gfp_mask(inode->i_mapping,
92 mapping_gfp_constraint(inode->i_mapping,
93 ~(__GFP_FS | __GFP_HIGHMEM)));
94
95 return inode;
96 }
97
98 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
99 struct btrfs_path *path)
100 {
101 struct btrfs_fs_info *fs_info = block_group->fs_info;
102 struct inode *inode = NULL;
103 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
104
105 spin_lock(&block_group->lock);
106 if (block_group->inode)
107 inode = igrab(block_group->inode);
108 spin_unlock(&block_group->lock);
109 if (inode)
110 return inode;
111
112 inode = __lookup_free_space_inode(fs_info->tree_root, path,
113 block_group->start);
114 if (IS_ERR(inode))
115 return inode;
116
117 spin_lock(&block_group->lock);
118 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
119 btrfs_info(fs_info, "Old style space inode found, converting.");
120 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
121 BTRFS_INODE_NODATACOW;
122 block_group->disk_cache_state = BTRFS_DC_CLEAR;
123 }
124
125 if (!block_group->iref) {
126 block_group->inode = igrab(inode);
127 block_group->iref = 1;
128 }
129 spin_unlock(&block_group->lock);
130
131 return inode;
132 }
133
134 static int __create_free_space_inode(struct btrfs_root *root,
135 struct btrfs_trans_handle *trans,
136 struct btrfs_path *path,
137 u64 ino, u64 offset)
138 {
139 struct btrfs_key key;
140 struct btrfs_disk_key disk_key;
141 struct btrfs_free_space_header *header;
142 struct btrfs_inode_item *inode_item;
143 struct extent_buffer *leaf;
144 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
145 int ret;
146
147 ret = btrfs_insert_empty_inode(trans, root, path, ino);
148 if (ret)
149 return ret;
150
151 /* We inline crcs for the free disk space cache */
152 if (ino != BTRFS_FREE_INO_OBJECTID)
153 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
154
155 leaf = path->nodes[0];
156 inode_item = btrfs_item_ptr(leaf, path->slots[0],
157 struct btrfs_inode_item);
158 btrfs_item_key(leaf, &disk_key, path->slots[0]);
159 memzero_extent_buffer(leaf, (unsigned long)inode_item,
160 sizeof(*inode_item));
161 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
162 btrfs_set_inode_size(leaf, inode_item, 0);
163 btrfs_set_inode_nbytes(leaf, inode_item, 0);
164 btrfs_set_inode_uid(leaf, inode_item, 0);
165 btrfs_set_inode_gid(leaf, inode_item, 0);
166 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
167 btrfs_set_inode_flags(leaf, inode_item, flags);
168 btrfs_set_inode_nlink(leaf, inode_item, 1);
169 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
170 btrfs_set_inode_block_group(leaf, inode_item, offset);
171 btrfs_mark_buffer_dirty(leaf);
172 btrfs_release_path(path);
173
174 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
175 key.offset = offset;
176 key.type = 0;
177 ret = btrfs_insert_empty_item(trans, root, path, &key,
178 sizeof(struct btrfs_free_space_header));
179 if (ret < 0) {
180 btrfs_release_path(path);
181 return ret;
182 }
183
184 leaf = path->nodes[0];
185 header = btrfs_item_ptr(leaf, path->slots[0],
186 struct btrfs_free_space_header);
187 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
188 btrfs_set_free_space_key(leaf, header, &disk_key);
189 btrfs_mark_buffer_dirty(leaf);
190 btrfs_release_path(path);
191
192 return 0;
193 }
194
195 int create_free_space_inode(struct btrfs_trans_handle *trans,
196 struct btrfs_block_group *block_group,
197 struct btrfs_path *path)
198 {
199 int ret;
200 u64 ino;
201
202 ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
203 if (ret < 0)
204 return ret;
205
206 return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
207 ino, block_group->start);
208 }
209
210 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
211 struct btrfs_block_rsv *rsv)
212 {
213 u64 needed_bytes;
214 int ret;
215
216 /* 1 for slack space, 1 for updating the inode */
217 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
218 btrfs_calc_metadata_size(fs_info, 1);
219
220 spin_lock(&rsv->lock);
221 if (rsv->reserved < needed_bytes)
222 ret = -ENOSPC;
223 else
224 ret = 0;
225 spin_unlock(&rsv->lock);
226 return ret;
227 }
228
229 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
230 struct btrfs_block_group *block_group,
231 struct inode *inode)
232 {
233 struct btrfs_root *root = BTRFS_I(inode)->root;
234 int ret = 0;
235 bool locked = false;
236
237 if (block_group) {
238 struct btrfs_path *path = btrfs_alloc_path();
239
240 if (!path) {
241 ret = -ENOMEM;
242 goto fail;
243 }
244 locked = true;
245 mutex_lock(&trans->transaction->cache_write_mutex);
246 if (!list_empty(&block_group->io_list)) {
247 list_del_init(&block_group->io_list);
248
249 btrfs_wait_cache_io(trans, block_group, path);
250 btrfs_put_block_group(block_group);
251 }
252
253 /*
254 * Now that we've truncated the cache away, it's no longer
255 * set up or written.
256 */
257 spin_lock(&block_group->lock);
258 block_group->disk_cache_state = BTRFS_DC_CLEAR;
259 spin_unlock(&block_group->lock);
260 btrfs_free_path(path);
261 }
262
263 btrfs_i_size_write(BTRFS_I(inode), 0);
264 truncate_pagecache(inode, 0);
265
266 /*
267 * We skip the throttling logic for free space cache inodes, so we don't
268 * need to check for -EAGAIN.
269 */
270 ret = btrfs_truncate_inode_items(trans, root, inode,
271 0, BTRFS_EXTENT_DATA_KEY);
272 if (ret)
273 goto fail;
274
275 ret = btrfs_update_inode(trans, root, inode);
276
277 fail:
278 if (locked)
279 mutex_unlock(&trans->transaction->cache_write_mutex);
280 if (ret)
281 btrfs_abort_transaction(trans, ret);
282
283 return ret;
284 }
285
286 static void readahead_cache(struct inode *inode)
287 {
288 struct file_ra_state *ra;
289 unsigned long last_index;
290
291 ra = kzalloc(sizeof(*ra), GFP_NOFS);
292 if (!ra)
293 return;
294
295 file_ra_state_init(ra, inode->i_mapping);
296 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
297
298 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
299
300 kfree(ra);
301 }
302
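/*
 * Set up @io_ctl for reading or writing the cache file backed by @inode:
 * allocate the page pointer array and decide whether per-page crcs are used
 * (they are for everything except the free inode cache). For writes, fail
 * if the crc array plus the generation would not fit in the first page.
 */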
303 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
304 int write)
305 {
306 int num_pages;
307 int check_crcs = 0;
308
309 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
310
311 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
312 check_crcs = 1;
313
314 /* Make sure we can fit our crcs and generation into the first page */
315 if (write && check_crcs &&
316 (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
317 return -ENOSPC;
318
319 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
320
321 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
322 if (!io_ctl->pages)
323 return -ENOMEM;
324
325 io_ctl->num_pages = num_pages;
326 io_ctl->fs_info = btrfs_sb(inode->i_sb);
327 io_ctl->check_crcs = check_crcs;
328 io_ctl->inode = inode;
329
330 return 0;
331 }
332 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
333
334 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
335 {
336 kfree(io_ctl->pages);
337 io_ctl->pages = NULL;
338 }
339
340 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
341 {
342 if (io_ctl->cur) {
343 io_ctl->cur = NULL;
344 io_ctl->orig = NULL;
345 }
346 }
347
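/* Point cur/orig at the contents of the next cache page, optionally zeroing it. */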
348 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
349 {
350 ASSERT(io_ctl->index < io_ctl->num_pages);
351 io_ctl->page = io_ctl->pages[io_ctl->index++];
352 io_ctl->cur = page_address(io_ctl->page);
353 io_ctl->orig = io_ctl->cur;
354 io_ctl->size = PAGE_SIZE;
355 if (clear)
356 clear_page(io_ctl->cur);
357 }
358
359 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
360 {
361 int i;
362
363 io_ctl_unmap_page(io_ctl);
364
365 for (i = 0; i < io_ctl->num_pages; i++) {
366 if (io_ctl->pages[i]) {
367 ClearPageChecked(io_ctl->pages[i]);
368 unlock_page(io_ctl->pages[i]);
369 put_page(io_ctl->pages[i]);
370 }
371 }
372 }
373
374 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
375 {
376 struct page *page;
377 struct inode *inode = io_ctl->inode;
378 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
379 int i;
380
381 for (i = 0; i < io_ctl->num_pages; i++) {
382 page = find_or_create_page(inode->i_mapping, i, mask);
383 if (!page) {
384 io_ctl_drop_pages(io_ctl);
385 return -ENOMEM;
386 }
387 io_ctl->pages[i] = page;
388 if (uptodate && !PageUptodate(page)) {
389 btrfs_readpage(NULL, page);
390 lock_page(page);
391 if (page->mapping != inode->i_mapping) {
392 btrfs_err(BTRFS_I(inode)->root->fs_info,
393 "free space cache page truncated");
394 io_ctl_drop_pages(io_ctl);
395 return -EIO;
396 }
397 if (!PageUptodate(page)) {
398 btrfs_err(BTRFS_I(inode)->root->fs_info,
399 "error reading free space cache");
400 io_ctl_drop_pages(io_ctl);
401 return -EIO;
402 }
403 }
404 }
405
406 for (i = 0; i < io_ctl->num_pages; i++) {
407 clear_page_dirty_for_io(io_ctl->pages[i]);
408 set_page_extent_mapped(io_ctl->pages[i]);
409 }
410
411 return 0;
412 }
413
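/*
 * Map the first page and store the transaction generation right after the
 * crc area (or at the very start of the page when crcs are not used).
 */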
414 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
415 {
416 io_ctl_map_page(io_ctl, 1);
417
418 /*
419 * Skip the csum areas. If we don't check crcs then we just have a
420 * 64bit chunk at the front of the first page.
421 */
422 if (io_ctl->check_crcs) {
423 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
424 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
425 } else {
426 io_ctl->cur += sizeof(u64);
427 io_ctl->size -= sizeof(u64) * 2;
428 }
429
430 put_unaligned_le64(generation, io_ctl->cur);
431 io_ctl->cur += sizeof(u64);
432 }
433
434 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
435 {
436 u64 cache_gen;
437
438 /*
439 * Skip the crc area. If we don't check crcs then we just have a 64bit
440 * chunk at the front of the first page.
441 */
442 if (io_ctl->check_crcs) {
443 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
444 io_ctl->size -= sizeof(u64) +
445 (sizeof(u32) * io_ctl->num_pages);
446 } else {
447 io_ctl->cur += sizeof(u64);
448 io_ctl->size -= sizeof(u64) * 2;
449 }
450
451 cache_gen = get_unaligned_le64(io_ctl->cur);
452 if (cache_gen != generation) {
453 btrfs_err_rl(io_ctl->fs_info,
454 "space cache generation (%llu) does not match inode (%llu)",
455 cache_gen, generation);
456 io_ctl_unmap_page(io_ctl);
457 return -EIO;
458 }
459 io_ctl->cur += sizeof(u64);
460 return 0;
461 }
462
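/*
 * Checksum the page we just filled and store the crc32c in the crc array
 * that lives at the start of the first page. Page 0 skips the crc array
 * itself when computing its checksum.
 */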
463 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
464 {
465 u32 *tmp;
466 u32 crc = ~(u32)0;
467 unsigned offset = 0;
468
469 if (!io_ctl->check_crcs) {
470 io_ctl_unmap_page(io_ctl);
471 return;
472 }
473
474 if (index == 0)
475 offset = sizeof(u32) * io_ctl->num_pages;
476
477 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
478 btrfs_crc32c_final(crc, (u8 *)&crc);
479 io_ctl_unmap_page(io_ctl);
480 tmp = page_address(io_ctl->pages[0]);
481 tmp += index;
482 *tmp = crc;
483 }
484
485 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
486 {
487 u32 *tmp, val;
488 u32 crc = ~(u32)0;
489 unsigned offset = 0;
490
491 if (!io_ctl->check_crcs) {
492 io_ctl_map_page(io_ctl, 0);
493 return 0;
494 }
495
496 if (index == 0)
497 offset = sizeof(u32) * io_ctl->num_pages;
498
499 tmp = page_address(io_ctl->pages[0]);
500 tmp += index;
501 val = *tmp;
502
503 io_ctl_map_page(io_ctl, 0);
504 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
505 btrfs_crc32c_final(crc, (u8 *)&crc);
506 if (val != crc) {
507 btrfs_err_rl(io_ctl->fs_info,
508 "csum mismatch on free space cache");
509 io_ctl_unmap_page(io_ctl);
510 return -EIO;
511 }
512
513 return 0;
514 }
515
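/*
 * Append one on-disk free space entry (offset, bytes, type) to the cache.
 * When the current page fills up, crc it and map the next page.
 */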
516 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
517 void *bitmap)
518 {
519 struct btrfs_free_space_entry *entry;
520
521 if (!io_ctl->cur)
522 return -ENOSPC;
523
524 entry = io_ctl->cur;
525 put_unaligned_le64(offset, &entry->offset);
526 put_unaligned_le64(bytes, &entry->bytes);
527 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
528 BTRFS_FREE_SPACE_EXTENT;
529 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
530 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
531
532 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
533 return 0;
534
535 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
536
537 /* No more pages to map */
538 if (io_ctl->index >= io_ctl->num_pages)
539 return 0;
540
541 /* map the next page */
542 io_ctl_map_page(io_ctl, 1);
543 return 0;
544 }
545
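/*
 * Bitmaps are written as whole pages, so advance to a page boundary if
 * needed and copy the PAGE_SIZE bitmap payload into the cache file.
 */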
546 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
547 {
548 if (!io_ctl->cur)
549 return -ENOSPC;
550
551 /*
552 * If we aren't at the start of the current page, unmap this one and
553 * map the next one if there is any left.
554 */
555 if (io_ctl->cur != io_ctl->orig) {
556 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
557 if (io_ctl->index >= io_ctl->num_pages)
558 return -ENOSPC;
559 io_ctl_map_page(io_ctl, 0);
560 }
561
562 copy_page(io_ctl->cur, bitmap);
563 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
564 if (io_ctl->index < io_ctl->num_pages)
565 io_ctl_map_page(io_ctl, 0);
566 return 0;
567 }
568
569 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
570 {
571 /*
572 * If we're not on the boundary we know we've modified the page and we
573 * need to crc the page.
574 */
575 if (io_ctl->cur != io_ctl->orig)
576 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
577 else
578 io_ctl_unmap_page(io_ctl);
579
580 while (io_ctl->index < io_ctl->num_pages) {
581 io_ctl_map_page(io_ctl, 1);
582 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
583 }
584 }
585
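/*
 * Read the next on-disk free space entry into @entry, verifying the page
 * crc whenever a new page is crossed.
 */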
586 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
587 struct btrfs_free_space *entry, u8 *type)
588 {
589 struct btrfs_free_space_entry *e;
590 int ret;
591
592 if (!io_ctl->cur) {
593 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
594 if (ret)
595 return ret;
596 }
597
598 e = io_ctl->cur;
599 entry->offset = get_unaligned_le64(&e->offset);
600 entry->bytes = get_unaligned_le64(&e->bytes);
601 *type = e->type;
602 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
603 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
604
605 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
606 return 0;
607
608 io_ctl_unmap_page(io_ctl);
609
610 return 0;
611 }
612
613 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
614 struct btrfs_free_space *entry)
615 {
616 int ret;
617
618 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
619 if (ret)
620 return ret;
621
622 copy_page(entry->bitmap, io_ctl->cur);
623 io_ctl_unmap_page(io_ctl);
624
625 return 0;
626 }
627
628 /*
629 * Since we attach pinned extents after the fact, we can have contiguous
630 * sections of free space that are split up into separate entries. This is a
631 * problem for the tree logging code: an allocation may have spanned what now
632 * looks like two entries, because those entries would have been merged when
633 * the pinned extents were added back to the free space cache. So run through
634 * the space cache we just loaded and merge contiguous entries. This keeps log
635 * replay from blowing up and makes for nicer allocator behavior.
636 */
637 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
638 {
639 struct btrfs_free_space *e, *prev = NULL;
640 struct rb_node *n;
641
642 again:
643 spin_lock(&ctl->tree_lock);
644 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
645 e = rb_entry(n, struct btrfs_free_space, offset_index);
646 if (!prev)
647 goto next;
648 if (e->bitmap || prev->bitmap)
649 goto next;
650 if (prev->offset + prev->bytes == e->offset) {
651 unlink_free_space(ctl, prev);
652 unlink_free_space(ctl, e);
653 prev->bytes += e->bytes;
654 kmem_cache_free(btrfs_free_space_cachep, e);
655 link_free_space(ctl, prev);
656 prev = NULL;
657 spin_unlock(&ctl->tree_lock);
658 goto again;
659 }
660 next:
661 prev = e;
662 }
663 spin_unlock(&ctl->tree_lock);
664 }
665
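/*
 * Read the free space cache file for @offset into @ctl: verify the header
 * generation against the inode, then load every extent entry and bitmap.
 * Returns 1 on success, 0 when the cache should simply be ignored, and a
 * negative value on failure (in which case any partially built cache is
 * dropped).
 */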
666 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
667 struct btrfs_free_space_ctl *ctl,
668 struct btrfs_path *path, u64 offset)
669 {
670 struct btrfs_fs_info *fs_info = root->fs_info;
671 struct btrfs_free_space_header *header;
672 struct extent_buffer *leaf;
673 struct btrfs_io_ctl io_ctl;
674 struct btrfs_key key;
675 struct btrfs_free_space *e, *n;
676 LIST_HEAD(bitmaps);
677 u64 num_entries;
678 u64 num_bitmaps;
679 u64 generation;
680 u8 type;
681 int ret = 0;
682
683 /* Nothing in the space cache, goodbye */
684 if (!i_size_read(inode))
685 return 0;
686
687 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
688 key.offset = offset;
689 key.type = 0;
690
691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
692 if (ret < 0)
693 return 0;
694 else if (ret > 0) {
695 btrfs_release_path(path);
696 return 0;
697 }
698
699 ret = -1;
700
701 leaf = path->nodes[0];
702 header = btrfs_item_ptr(leaf, path->slots[0],
703 struct btrfs_free_space_header);
704 num_entries = btrfs_free_space_entries(leaf, header);
705 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
706 generation = btrfs_free_space_generation(leaf, header);
707 btrfs_release_path(path);
708
709 if (!BTRFS_I(inode)->generation) {
710 btrfs_info(fs_info,
711 "the free space cache file (%llu) is invalid, skip it",
712 offset);
713 return 0;
714 }
715
716 if (BTRFS_I(inode)->generation != generation) {
717 btrfs_err(fs_info,
718 "free space inode generation (%llu) did not match free space cache generation (%llu)",
719 BTRFS_I(inode)->generation, generation);
720 return 0;
721 }
722
723 if (!num_entries)
724 return 0;
725
726 ret = io_ctl_init(&io_ctl, inode, 0);
727 if (ret)
728 return ret;
729
730 readahead_cache(inode);
731
732 ret = io_ctl_prepare_pages(&io_ctl, true);
733 if (ret)
734 goto out;
735
736 ret = io_ctl_check_crc(&io_ctl, 0);
737 if (ret)
738 goto free_cache;
739
740 ret = io_ctl_check_generation(&io_ctl, generation);
741 if (ret)
742 goto free_cache;
743
744 while (num_entries) {
745 e = kmem_cache_zalloc(btrfs_free_space_cachep,
746 GFP_NOFS);
747 if (!e) {
748 ret = -ENOMEM;
749 goto free_cache;
750 }
751
752 ret = io_ctl_read_entry(&io_ctl, e, &type);
753 if (ret) {
754 kmem_cache_free(btrfs_free_space_cachep, e);
755 goto free_cache;
756 }
757
758 /*
759 * Sync discard ensures that the free space cache is always
760 * trimmed. So when reading this in, the state should reflect
761 * that. We also do this for async as a stopgap for lack of
762 * persistence.
763 */
764 if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
765 btrfs_test_opt(fs_info, DISCARD_ASYNC))
766 e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
767
768 if (!e->bytes) {
769 ret = -1;
770 kmem_cache_free(btrfs_free_space_cachep, e);
771 goto free_cache;
772 }
773
774 if (type == BTRFS_FREE_SPACE_EXTENT) {
775 spin_lock(&ctl->tree_lock);
776 ret = link_free_space(ctl, e);
777 spin_unlock(&ctl->tree_lock);
778 if (ret) {
779 btrfs_err(fs_info,
780 "Duplicate entries in free space cache, dumping");
781 kmem_cache_free(btrfs_free_space_cachep, e);
782 goto free_cache;
783 }
784 } else {
785 ASSERT(num_bitmaps);
786 num_bitmaps--;
787 e->bitmap = kmem_cache_zalloc(
788 btrfs_free_space_bitmap_cachep, GFP_NOFS);
789 if (!e->bitmap) {
790 ret = -ENOMEM;
791 kmem_cache_free(
792 btrfs_free_space_cachep, e);
793 goto free_cache;
794 }
795 spin_lock(&ctl->tree_lock);
796 ret = link_free_space(ctl, e);
797 ctl->total_bitmaps++;
798 ctl->op->recalc_thresholds(ctl);
799 spin_unlock(&ctl->tree_lock);
800 if (ret) {
801 btrfs_err(fs_info,
802 "Duplicate entries in free space cache, dumping");
803 kmem_cache_free(btrfs_free_space_cachep, e);
804 goto free_cache;
805 }
806 list_add_tail(&e->list, &bitmaps);
807 }
808
809 num_entries--;
810 }
811
812 io_ctl_unmap_page(&io_ctl);
813
814 /*
815 * The bitmap pages are stored after all of the entries, so read the
816 * bitmap data now that every entry has been loaded into the cache.
817 */
818 list_for_each_entry_safe(e, n, &bitmaps, list) {
819 list_del_init(&e->list);
820 ret = io_ctl_read_bitmap(&io_ctl, e);
821 if (ret)
822 goto free_cache;
823 e->bitmap_extents = count_bitmap_extents(ctl, e);
824 if (!btrfs_free_space_trimmed(e)) {
825 ctl->discardable_extents[BTRFS_STAT_CURR] +=
826 e->bitmap_extents;
827 ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
828 }
829 }
830
831 io_ctl_drop_pages(&io_ctl);
832 merge_space_tree(ctl);
833 ret = 1;
834 out:
835 btrfs_discard_update_discardable(ctl->private, ctl);
836 io_ctl_free(&io_ctl);
837 return ret;
838 free_cache:
839 io_ctl_drop_pages(&io_ctl);
840 __btrfs_remove_free_space_cache(ctl);
841 goto out;
842 }
843
844 int load_free_space_cache(struct btrfs_block_group *block_group)
845 {
846 struct btrfs_fs_info *fs_info = block_group->fs_info;
847 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
848 struct inode *inode;
849 struct btrfs_path *path;
850 int ret = 0;
851 bool matched;
852 u64 used = block_group->used;
853
854 /*
855 * If this block group has been marked to be cleared for one reason or
856 * another then we can't trust the on disk cache, so just return.
857 */
858 spin_lock(&block_group->lock);
859 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
860 spin_unlock(&block_group->lock);
861 return 0;
862 }
863 spin_unlock(&block_group->lock);
864
865 path = btrfs_alloc_path();
866 if (!path)
867 return 0;
868 path->search_commit_root = 1;
869 path->skip_locking = 1;
870
871 /*
872 * We must pass a path with search_commit_root set to btrfs_iget in
873 * order to avoid a deadlock when allocating extents for the tree root.
874 *
875 * When we are COWing an extent buffer from the tree root, when looking
876 * for a free extent, at extent-tree.c:find_free_extent(), we can find
877 * block group without its free space cache loaded. When we find one
878 * we must load its space cache which requires reading its free space
879 * cache's inode item from the root tree. If this inode item is located
880 * in the same leaf that we started COWing before, then we end up in
881 * deadlock on the extent buffer (trying to read lock it when we
882 * previously write locked it).
883 *
884 * It's safe to read the inode item using the commit root because
885 * block groups, once loaded, stay in memory forever (until they are
886 * removed) as well as their space caches once loaded. New block groups
887 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
888 * we will never try to read their inode item while the fs is mounted.
889 */
890 inode = lookup_free_space_inode(block_group, path);
891 if (IS_ERR(inode)) {
892 btrfs_free_path(path);
893 return 0;
894 }
895
896 /* We may have converted the inode and made the cache invalid. */
897 spin_lock(&block_group->lock);
898 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
899 spin_unlock(&block_group->lock);
900 btrfs_free_path(path);
901 goto out;
902 }
903 spin_unlock(&block_group->lock);
904
905 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
906 path, block_group->start);
907 btrfs_free_path(path);
908 if (ret <= 0)
909 goto out;
910
911 spin_lock(&ctl->tree_lock);
912 matched = (ctl->free_space == (block_group->length - used -
913 block_group->bytes_super));
914 spin_unlock(&ctl->tree_lock);
915
916 if (!matched) {
917 __btrfs_remove_free_space_cache(ctl);
918 btrfs_warn(fs_info,
919 "block group %llu has wrong amount of free space",
920 block_group->start);
921 ret = -1;
922 }
923 out:
924 if (ret < 0) {
925 /* This cache is bogus, make sure it gets cleared */
926 spin_lock(&block_group->lock);
927 block_group->disk_cache_state = BTRFS_DC_CLEAR;
928 spin_unlock(&block_group->lock);
929 ret = 0;
930
931 btrfs_warn(fs_info,
932 "failed to load free space cache for block group %llu, rebuilding it now",
933 block_group->start);
934 }
935
936 iput(inode);
937 return ret;
938 }
939
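/*
 * Write out every extent and bitmap entry in @ctl (plus the entries of the
 * block group's allocation cluster, if any, and any ranges that are currently
 * being trimmed). Bitmap entries are collected on @bitmap_list so their
 * page-sized payloads can be written after all of the entries.
 */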
940 static noinline_for_stack
941 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
942 struct btrfs_free_space_ctl *ctl,
943 struct btrfs_block_group *block_group,
944 int *entries, int *bitmaps,
945 struct list_head *bitmap_list)
946 {
947 int ret;
948 struct btrfs_free_cluster *cluster = NULL;
949 struct btrfs_free_cluster *cluster_locked = NULL;
950 struct rb_node *node = rb_first(&ctl->free_space_offset);
951 struct btrfs_trim_range *trim_entry;
952
953 /* Get the cluster for this block_group if it exists */
954 if (block_group && !list_empty(&block_group->cluster_list)) {
955 cluster = list_entry(block_group->cluster_list.next,
956 struct btrfs_free_cluster,
957 block_group_list);
958 }
959
960 if (!node && cluster) {
961 cluster_locked = cluster;
962 spin_lock(&cluster_locked->lock);
963 node = rb_first(&cluster->root);
964 cluster = NULL;
965 }
966
967 /* Write out the extent entries */
968 while (node) {
969 struct btrfs_free_space *e;
970
971 e = rb_entry(node, struct btrfs_free_space, offset_index);
972 *entries += 1;
973
974 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
975 e->bitmap);
976 if (ret)
977 goto fail;
978
979 if (e->bitmap) {
980 list_add_tail(&e->list, bitmap_list);
981 *bitmaps += 1;
982 }
983 node = rb_next(node);
984 if (!node && cluster) {
985 node = rb_first(&cluster->root);
986 cluster_locked = cluster;
987 spin_lock(&cluster_locked->lock);
988 cluster = NULL;
989 }
990 }
991 if (cluster_locked) {
992 spin_unlock(&cluster_locked->lock);
993 cluster_locked = NULL;
994 }
995
996 /*
997 * Make sure we don't miss any range that was removed from our rbtree
998 * because trimming is running. Otherwise after a umount+mount (or crash
999 * after committing the transaction) we would leak free space and get
1000 * an inconsistent free space cache report from fsck.
1001 */
1002 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1003 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1004 trim_entry->bytes, NULL);
1005 if (ret)
1006 goto fail;
1007 *entries += 1;
1008 }
1009
1010 return 0;
1011 fail:
1012 if (cluster_locked)
1013 spin_unlock(&cluster_locked->lock);
1014 return -ENOSPC;
1015 }
1016
1017 static noinline_for_stack int
1018 update_cache_item(struct btrfs_trans_handle *trans,
1019 struct btrfs_root *root,
1020 struct inode *inode,
1021 struct btrfs_path *path, u64 offset,
1022 int entries, int bitmaps)
1023 {
1024 struct btrfs_key key;
1025 struct btrfs_free_space_header *header;
1026 struct extent_buffer *leaf;
1027 int ret;
1028
1029 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1030 key.offset = offset;
1031 key.type = 0;
1032
1033 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1034 if (ret < 0) {
1035 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1036 EXTENT_DELALLOC, 0, 0, NULL);
1037 goto fail;
1038 }
1039 leaf = path->nodes[0];
1040 if (ret > 0) {
1041 struct btrfs_key found_key;
1042 ASSERT(path->slots[0]);
1043 path->slots[0]--;
1044 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1045 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1046 found_key.offset != offset) {
1047 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1048 inode->i_size - 1, EXTENT_DELALLOC, 0,
1049 0, NULL);
1050 btrfs_release_path(path);
1051 goto fail;
1052 }
1053 }
1054
1055 BTRFS_I(inode)->generation = trans->transid;
1056 header = btrfs_item_ptr(leaf, path->slots[0],
1057 struct btrfs_free_space_header);
1058 btrfs_set_free_space_entries(leaf, header, entries);
1059 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1060 btrfs_set_free_space_generation(leaf, header, trans->transid);
1061 btrfs_mark_buffer_dirty(leaf);
1062 btrfs_release_path(path);
1063
1064 return 0;
1065
1066 fail:
1067 return -1;
1068 }
1069
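/*
 * Pinned extents are freed space that only becomes usable once the current
 * transaction commits, so they are not in the in-memory free space tree yet.
 * Walk the transaction's pinned_extents tree for this block group and write
 * those ranges out as extent entries so the cache file does not lose them.
 */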
1070 static noinline_for_stack int write_pinned_extent_entries(
1071 struct btrfs_trans_handle *trans,
1072 struct btrfs_block_group *block_group,
1073 struct btrfs_io_ctl *io_ctl,
1074 int *entries)
1075 {
1076 u64 start, extent_start, extent_end, len;
1077 struct extent_io_tree *unpin = NULL;
1078 int ret;
1079
1080 if (!block_group)
1081 return 0;
1082
1083 /*
1084 * We want to add any pinned extents to our free space cache
1085 * so we don't leak the space.
1086 *
1087 * We shouldn't have switched the pinned extents yet, so this is the
1088 * right one.
1089 */
1090 unpin = &trans->transaction->pinned_extents;
1091
1092 start = block_group->start;
1093
1094 while (start < block_group->start + block_group->length) {
1095 ret = find_first_extent_bit(unpin, start,
1096 &extent_start, &extent_end,
1097 EXTENT_DIRTY, NULL);
1098 if (ret)
1099 return 0;
1100
1101 /* This pinned extent is out of our range */
1102 if (extent_start >= block_group->start + block_group->length)
1103 return 0;
1104
1105 extent_start = max(extent_start, start);
1106 extent_end = min(block_group->start + block_group->length,
1107 extent_end + 1);
1108 len = extent_end - extent_start;
1109
1110 *entries += 1;
1111 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1112 if (ret)
1113 return -ENOSPC;
1114
1115 start = extent_end;
1116 }
1117
1118 return 0;
1119 }
1120
1121 static noinline_for_stack int
1122 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1123 {
1124 struct btrfs_free_space *entry, *next;
1125 int ret;
1126
1127 /* Write out the bitmaps */
1128 list_for_each_entry_safe(entry, next, bitmap_list, list) {
1129 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1130 if (ret)
1131 return -ENOSPC;
1132 list_del_init(&entry->list);
1133 }
1134
1135 return 0;
1136 }
1137
1138 static int flush_dirty_cache(struct inode *inode)
1139 {
1140 int ret;
1141
1142 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1143 if (ret)
1144 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1145 EXTENT_DELALLOC, 0, 0, NULL);
1146
1147 return ret;
1148 }
1149
1150 static void noinline_for_stack
1151 cleanup_bitmap_list(struct list_head *bitmap_list)
1152 {
1153 struct btrfs_free_space *entry, *next;
1154
1155 list_for_each_entry_safe(entry, next, bitmap_list, list)
1156 list_del_init(&entry->list);
1157 }
1158
1159 static void noinline_for_stack
1160 cleanup_write_cache_enospc(struct inode *inode,
1161 struct btrfs_io_ctl *io_ctl,
1162 struct extent_state **cached_state)
1163 {
1164 io_ctl_drop_pages(io_ctl);
1165 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1166 i_size_read(inode) - 1, cached_state);
1167 }
1168
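/*
 * Wait for the cache file's writeback to finish, then update the free space
 * header item so the cache is seen as valid. For block group caches, also
 * move disk_cache_state to WRITTEN on success or ERROR on failure.
 */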
1169 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1170 struct btrfs_trans_handle *trans,
1171 struct btrfs_block_group *block_group,
1172 struct btrfs_io_ctl *io_ctl,
1173 struct btrfs_path *path, u64 offset)
1174 {
1175 int ret;
1176 struct inode *inode = io_ctl->inode;
1177
1178 if (!inode)
1179 return 0;
1180
1181 /* Flush the dirty pages in the cache file. */
1182 ret = flush_dirty_cache(inode);
1183 if (ret)
1184 goto out;
1185
1186 /* Update the cache item to tell everyone this cache file is valid. */
1187 ret = update_cache_item(trans, root, inode, path, offset,
1188 io_ctl->entries, io_ctl->bitmaps);
1189 out:
1190 if (ret) {
1191 invalidate_inode_pages2(inode->i_mapping);
1192 BTRFS_I(inode)->generation = 0;
1193 if (block_group)
1194 btrfs_debug(root->fs_info,
1195 "failed to write free space cache for block group %llu error %d",
1196 block_group->start, ret);
1197 }
1198 btrfs_update_inode(trans, root, inode);
1199
1200 if (block_group) {
1201 /* the dirty list is protected by the dirty_bgs_lock */
1202 spin_lock(&trans->transaction->dirty_bgs_lock);
1203
1204 /* the disk_cache_state is protected by the block group lock */
1205 spin_lock(&block_group->lock);
1206
1207 /*
1208 * only mark this as written if we didn't get put back on
1209 * the dirty list while waiting for IO. Otherwise our
1210 * cache state won't be right, and we won't get written again
1211 */
1212 if (!ret && list_empty(&block_group->dirty_list))
1213 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1214 else if (ret)
1215 block_group->disk_cache_state = BTRFS_DC_ERROR;
1216
1217 spin_unlock(&block_group->lock);
1218 spin_unlock(&trans->transaction->dirty_bgs_lock);
1219 io_ctl->inode = NULL;
1220 iput(inode);
1221 }
1222
1223 return ret;
1224
1225 }
1226
1227 static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1228 struct btrfs_trans_handle *trans,
1229 struct btrfs_io_ctl *io_ctl,
1230 struct btrfs_path *path)
1231 {
1232 return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1233 }
1234
1235 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1236 struct btrfs_block_group *block_group,
1237 struct btrfs_path *path)
1238 {
1239 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1240 block_group, &block_group->io_ctl,
1241 path, block_group->start);
1242 }
1243
1244 /**
1245 * __btrfs_write_out_cache - write out cached info to an inode
1246 * @root: the root the inode belongs to
1247 * @ctl: the free space cache we are going to write out
1248 * @block_group: the block_group for this cache if it belongs to a block_group
1249 * @trans: the trans handle
1250 *
1251 * This function writes out a free space cache struct to disk for quick recovery
1252 * on mount. This will return 0 if it was successful in writing the cache out,
1253 * or an errno if it was not.
1254 */
1255 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1256 struct btrfs_free_space_ctl *ctl,
1257 struct btrfs_block_group *block_group,
1258 struct btrfs_io_ctl *io_ctl,
1259 struct btrfs_trans_handle *trans)
1260 {
1261 struct extent_state *cached_state = NULL;
1262 LIST_HEAD(bitmap_list);
1263 int entries = 0;
1264 int bitmaps = 0;
1265 int ret;
1266 int must_iput = 0;
1267
1268 if (!i_size_read(inode))
1269 return -EIO;
1270
1271 WARN_ON(io_ctl->pages);
1272 ret = io_ctl_init(io_ctl, inode, 1);
1273 if (ret)
1274 return ret;
1275
1276 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1277 down_write(&block_group->data_rwsem);
1278 spin_lock(&block_group->lock);
1279 if (block_group->delalloc_bytes) {
1280 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1281 spin_unlock(&block_group->lock);
1282 up_write(&block_group->data_rwsem);
1283 BTRFS_I(inode)->generation = 0;
1284 ret = 0;
1285 must_iput = 1;
1286 goto out;
1287 }
1288 spin_unlock(&block_group->lock);
1289 }
1290
1291 /* Lock all pages first so we can lock the extent safely. */
1292 ret = io_ctl_prepare_pages(io_ctl, false);
1293 if (ret)
1294 goto out_unlock;
1295
1296 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1297 &cached_state);
1298
1299 io_ctl_set_generation(io_ctl, trans->transid);
1300
1301 mutex_lock(&ctl->cache_writeout_mutex);
1302 /* Write out the extent entries in the free space cache */
1303 spin_lock(&ctl->tree_lock);
1304 ret = write_cache_extent_entries(io_ctl, ctl,
1305 block_group, &entries, &bitmaps,
1306 &bitmap_list);
1307 if (ret)
1308 goto out_nospc_locked;
1309
1310 /*
1311 * Some space that was freed in the current transaction is pinned; it will
1312 * only be added to the free space cache after the transaction is
1313 * committed, so we must not lose it.
1314 *
1315 * If this changes while we are working, we'll get added back to
1316 * the dirty list and redo it. No locking needed.
1317 */
1318 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1319 if (ret)
1320 goto out_nospc_locked;
1321
1322 /*
1323 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1324 * locked while doing it because a concurrent trim can be manipulating
1325 * or freeing the bitmap.
1326 */
1327 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1328 spin_unlock(&ctl->tree_lock);
1329 mutex_unlock(&ctl->cache_writeout_mutex);
1330 if (ret)
1331 goto out_nospc;
1332
1333 /* Zero out the rest of the pages just to make sure */
1334 io_ctl_zero_remaining_pages(io_ctl);
1335
1336 /* Everything is written out, now we dirty the pages in the file. */
1337 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1338 io_ctl->num_pages, 0, i_size_read(inode),
1339 &cached_state);
1340 if (ret)
1341 goto out_nospc;
1342
1343 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1344 up_write(&block_group->data_rwsem);
1345 /*
1346 * Release the pages and unlock the extent; we will flush
1347 * them out later.
1348 */
1349 io_ctl_drop_pages(io_ctl);
1350 io_ctl_free(io_ctl);
1351
1352 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1353 i_size_read(inode) - 1, &cached_state);
1354
1355 /*
1356 * At this point the pages are under IO and we're happy.
1357 * The caller is responsible for waiting on them and updating
1358 * the cache and the inode.
1359 */
1360 io_ctl->entries = entries;
1361 io_ctl->bitmaps = bitmaps;
1362
1363 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1364 if (ret)
1365 goto out;
1366
1367 return 0;
1368
1369 out_nospc_locked:
1370 cleanup_bitmap_list(&bitmap_list);
1371 spin_unlock(&ctl->tree_lock);
1372 mutex_unlock(&ctl->cache_writeout_mutex);
1373
1374 out_nospc:
1375 cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1376
1377 out_unlock:
1378 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1379 up_write(&block_group->data_rwsem);
1380
1381 out:
1382 io_ctl->inode = NULL;
1383 io_ctl_free(io_ctl);
1384 if (ret) {
1385 invalidate_inode_pages2(inode->i_mapping);
1386 BTRFS_I(inode)->generation = 0;
1387 }
1388 btrfs_update_inode(trans, root, inode);
1389 if (must_iput)
1390 iput(inode);
1391 return ret;
1392 }
1393
1394 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1395 struct btrfs_block_group *block_group,
1396 struct btrfs_path *path)
1397 {
1398 struct btrfs_fs_info *fs_info = trans->fs_info;
1399 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1400 struct inode *inode;
1401 int ret = 0;
1402
1403 spin_lock(&block_group->lock);
1404 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1405 spin_unlock(&block_group->lock);
1406 return 0;
1407 }
1408 spin_unlock(&block_group->lock);
1409
1410 inode = lookup_free_space_inode(block_group, path);
1411 if (IS_ERR(inode))
1412 return 0;
1413
1414 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1415 block_group, &block_group->io_ctl, trans);
1416 if (ret) {
1417 btrfs_debug(fs_info,
1418 "failed to write free space cache for block group %llu error %d",
1419 block_group->start, ret);
1420 spin_lock(&block_group->lock);
1421 block_group->disk_cache_state = BTRFS_DC_ERROR;
1422 spin_unlock(&block_group->lock);
1423
1424 block_group->io_ctl.inode = NULL;
1425 iput(inode);
1426 }
1427
1428 /*
1429 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1430 * to wait for IO and put the inode
1431 */
1432
1433 return ret;
1434 }
1435
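/*
 * Each bitmap entry covers BITS_PER_BITMAP * ctl->unit bytes of the block
 * group; the helpers below convert between byte offsets/lengths and bit
 * positions/counts within such a bitmap.
 */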
1436 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1437 u64 offset)
1438 {
1439 ASSERT(offset >= bitmap_start);
1440 offset -= bitmap_start;
1441 return (unsigned long)(div_u64(offset, unit));
1442 }
1443
1444 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1445 {
1446 return (unsigned long)(div_u64(bytes, unit));
1447 }
1448
1449 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1450 u64 offset)
1451 {
1452 u64 bitmap_start;
1453 u64 bytes_per_bitmap;
1454
1455 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1456 bitmap_start = offset - ctl->start;
1457 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1458 bitmap_start *= bytes_per_bitmap;
1459 bitmap_start += ctl->start;
1460
1461 return bitmap_start;
1462 }
1463
1464 static int tree_insert_offset(struct rb_root *root, u64 offset,
1465 struct rb_node *node, int bitmap)
1466 {
1467 struct rb_node **p = &root->rb_node;
1468 struct rb_node *parent = NULL;
1469 struct btrfs_free_space *info;
1470
1471 while (*p) {
1472 parent = *p;
1473 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1474
1475 if (offset < info->offset) {
1476 p = &(*p)->rb_left;
1477 } else if (offset > info->offset) {
1478 p = &(*p)->rb_right;
1479 } else {
1480 /*
1481 * we could have a bitmap entry and an extent entry
1482 * share the same offset. If this is the case, we want
1483 * the extent entry to always be found first if we do a
1484 * linear search through the tree, since we want to have
1485 * the quickest allocation time, and allocating from an
1486 * extent is faster than allocating from a bitmap. So
1487 * if we're inserting a bitmap and we find an entry at
1488 * this offset, we want to go right, or after this entry
1489 * logically. If we are inserting an extent and we've
1490 * found a bitmap, we want to go left, or before
1491 * logically.
1492 */
1493 if (bitmap) {
1494 if (info->bitmap) {
1495 WARN_ON_ONCE(1);
1496 return -EEXIST;
1497 }
1498 p = &(*p)->rb_right;
1499 } else {
1500 if (!info->bitmap) {
1501 WARN_ON_ONCE(1);
1502 return -EEXIST;
1503 }
1504 p = &(*p)->rb_left;
1505 }
1506 }
1507 }
1508
1509 rb_link_node(node, parent, p);
1510 rb_insert_color(node, root);
1511
1512 return 0;
1513 }
1514
1515 /*
1516 * Searches the tree for the given offset.
1517 *
1518 * fuzzy - If this is set, then we are trying to make an allocation, and we
1519 * just want a section that is at least 'bytes' in size and starts at or
1520 * after the given offset.
1521 */
1522 static struct btrfs_free_space *
1523 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1524 u64 offset, int bitmap_only, int fuzzy)
1525 {
1526 struct rb_node *n = ctl->free_space_offset.rb_node;
1527 struct btrfs_free_space *entry, *prev = NULL;
1528
1529 /* find entry that is closest to the 'offset' */
1530 while (1) {
1531 if (!n) {
1532 entry = NULL;
1533 break;
1534 }
1535
1536 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1537 prev = entry;
1538
1539 if (offset < entry->offset)
1540 n = n->rb_left;
1541 else if (offset > entry->offset)
1542 n = n->rb_right;
1543 else
1544 break;
1545 }
1546
1547 if (bitmap_only) {
1548 if (!entry)
1549 return NULL;
1550 if (entry->bitmap)
1551 return entry;
1552
1553 /*
1554 * bitmap entry and extent entry may share same offset,
1555 * in that case, bitmap entry comes after extent entry.
1556 */
1557 n = rb_next(n);
1558 if (!n)
1559 return NULL;
1560 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1561 if (entry->offset != offset)
1562 return NULL;
1563
1564 WARN_ON(!entry->bitmap);
1565 return entry;
1566 } else if (entry) {
1567 if (entry->bitmap) {
1568 /*
1569 * if previous extent entry covers the offset,
1570 * we should return it instead of the bitmap entry
1571 */
1572 n = rb_prev(&entry->offset_index);
1573 if (n) {
1574 prev = rb_entry(n, struct btrfs_free_space,
1575 offset_index);
1576 if (!prev->bitmap &&
1577 prev->offset + prev->bytes > offset)
1578 entry = prev;
1579 }
1580 }
1581 return entry;
1582 }
1583
1584 if (!prev)
1585 return NULL;
1586
1587 /* find last entry before the 'offset' */
1588 entry = prev;
1589 if (entry->offset > offset) {
1590 n = rb_prev(&entry->offset_index);
1591 if (n) {
1592 entry = rb_entry(n, struct btrfs_free_space,
1593 offset_index);
1594 ASSERT(entry->offset <= offset);
1595 } else {
1596 if (fuzzy)
1597 return entry;
1598 else
1599 return NULL;
1600 }
1601 }
1602
1603 if (entry->bitmap) {
1604 n = rb_prev(&entry->offset_index);
1605 if (n) {
1606 prev = rb_entry(n, struct btrfs_free_space,
1607 offset_index);
1608 if (!prev->bitmap &&
1609 prev->offset + prev->bytes > offset)
1610 return prev;
1611 }
1612 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1613 return entry;
1614 } else if (entry->offset + entry->bytes > offset)
1615 return entry;
1616
1617 if (!fuzzy)
1618 return NULL;
1619
1620 while (1) {
1621 if (entry->bitmap) {
1622 if (entry->offset + BITS_PER_BITMAP *
1623 ctl->unit > offset)
1624 break;
1625 } else {
1626 if (entry->offset + entry->bytes > offset)
1627 break;
1628 }
1629
1630 n = rb_next(&entry->offset_index);
1631 if (!n)
1632 return NULL;
1633 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1634 }
1635 return entry;
1636 }
1637
1638 static inline void
1639 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1640 struct btrfs_free_space *info)
1641 {
1642 rb_erase(&info->offset_index, &ctl->free_space_offset);
1643 ctl->free_extents--;
1644
1645 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1646 ctl->discardable_extents[BTRFS_STAT_CURR]--;
1647 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1648 }
1649 }
1650
1651 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1652 struct btrfs_free_space *info)
1653 {
1654 __unlink_free_space(ctl, info);
1655 ctl->free_space -= info->bytes;
1656 }
1657
1658 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1659 struct btrfs_free_space *info)
1660 {
1661 int ret = 0;
1662
1663 ASSERT(info->bytes || info->bitmap);
1664 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1665 &info->offset_index, (info->bitmap != NULL));
1666 if (ret)
1667 return ret;
1668
1669 if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1670 ctl->discardable_extents[BTRFS_STAT_CURR]++;
1671 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1672 }
1673
1674 ctl->free_space += info->bytes;
1675 ctl->free_extents++;
1676 return ret;
1677 }
1678
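/*
 * Recompute ctl->extents_thresh, the number of extent entries we allow
 * before new free space goes into bitmaps instead, aiming to keep the
 * cache's memory use around MAX_CACHE_BYTES_PER_GIG per 1GiB of space.
 */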
1679 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1680 {
1681 struct btrfs_block_group *block_group = ctl->private;
1682 u64 max_bytes;
1683 u64 bitmap_bytes;
1684 u64 extent_bytes;
1685 u64 size = block_group->length;
1686 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1687 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1688
1689 max_bitmaps = max_t(u64, max_bitmaps, 1);
1690
1691 ASSERT(ctl->total_bitmaps <= max_bitmaps);
1692
1693 /*
1694 * We are trying to keep the total amount of memory used per 1GiB of
1695 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
1696 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
1697 * bitmaps, we may end up using more memory than this.
1698 */
1699 if (size < SZ_1G)
1700 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1701 else
1702 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1703
1704 bitmap_bytes = ctl->total_bitmaps * ctl->unit;
1705
1706 /*
1707 * We want the extent entry threshold to be at most half of max_bytes,
1708 * or whatever is left over after the bitmaps, whichever is smaller.
1709 */
1710 extent_bytes = max_bytes - bitmap_bytes;
1711 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1712
1713 ctl->extents_thresh =
1714 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1715 }
1716
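/*
 * Clear the bits covering [offset, offset + bytes) in a bitmap entry and
 * update the byte, extent and discard accounting. extent_delta starts at -1
 * (one region removed) and gains 1 for each still-set neighbouring bit,
 * since clearing the middle of a region splits it in two.
 */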
1717 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1718 struct btrfs_free_space *info,
1719 u64 offset, u64 bytes)
1720 {
1721 unsigned long start, count, end;
1722 int extent_delta = -1;
1723
1724 start = offset_to_bit(info->offset, ctl->unit, offset);
1725 count = bytes_to_bits(bytes, ctl->unit);
1726 end = start + count;
1727 ASSERT(end <= BITS_PER_BITMAP);
1728
1729 bitmap_clear(info->bitmap, start, count);
1730
1731 info->bytes -= bytes;
1732 if (info->max_extent_size > ctl->unit)
1733 info->max_extent_size = 0;
1734
1735 if (start && test_bit(start - 1, info->bitmap))
1736 extent_delta++;
1737
1738 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1739 extent_delta++;
1740
1741 info->bitmap_extents += extent_delta;
1742 if (!btrfs_free_space_trimmed(info)) {
1743 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1744 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1745 }
1746 }
1747
1748 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1749 struct btrfs_free_space *info, u64 offset,
1750 u64 bytes)
1751 {
1752 __bitmap_clear_bits(ctl, info, offset, bytes);
1753 ctl->free_space -= bytes;
1754 }
1755
1756 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1757 struct btrfs_free_space *info, u64 offset,
1758 u64 bytes)
1759 {
1760 unsigned long start, count, end;
1761 int extent_delta = 1;
1762
1763 start = offset_to_bit(info->offset, ctl->unit, offset);
1764 count = bytes_to_bits(bytes, ctl->unit);
1765 end = start + count;
1766 ASSERT(end <= BITS_PER_BITMAP);
1767
1768 bitmap_set(info->bitmap, start, count);
1769
1770 info->bytes += bytes;
1771 ctl->free_space += bytes;
1772
1773 if (start && test_bit(start - 1, info->bitmap))
1774 extent_delta--;
1775
1776 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1777 extent_delta--;
1778
1779 info->bitmap_extents += extent_delta;
1780 if (!btrfs_free_space_trimmed(info)) {
1781 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1782 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1783 }
1784 }
1785
1786 /*
1787 * If we cannot find a suitable extent, use *bytes to record the size of
1788 * the largest free extent that was found.
1789 */
1790 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1791 struct btrfs_free_space *bitmap_info, u64 *offset,
1792 u64 *bytes, bool for_alloc)
1793 {
1794 unsigned long found_bits = 0;
1795 unsigned long max_bits = 0;
1796 unsigned long bits, i;
1797 unsigned long next_zero;
1798 unsigned long extent_bits;
1799
1800 /*
1801 * Skip searching the bitmap if we don't have a contiguous section that
1802 * is large enough for this allocation.
1803 */
1804 if (for_alloc &&
1805 bitmap_info->max_extent_size &&
1806 bitmap_info->max_extent_size < *bytes) {
1807 *bytes = bitmap_info->max_extent_size;
1808 return -1;
1809 }
1810
1811 i = offset_to_bit(bitmap_info->offset, ctl->unit,
1812 max_t(u64, *offset, bitmap_info->offset));
1813 bits = bytes_to_bits(*bytes, ctl->unit);
1814
1815 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1816 if (for_alloc && bits == 1) {
1817 found_bits = 1;
1818 break;
1819 }
1820 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1821 BITS_PER_BITMAP, i);
1822 extent_bits = next_zero - i;
1823 if (extent_bits >= bits) {
1824 found_bits = extent_bits;
1825 break;
1826 } else if (extent_bits > max_bits) {
1827 max_bits = extent_bits;
1828 }
1829 i = next_zero;
1830 }
1831
1832 if (found_bits) {
1833 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1834 *bytes = (u64)(found_bits) * ctl->unit;
1835 return 0;
1836 }
1837
1838 *bytes = (u64)(max_bits) * ctl->unit;
1839 bitmap_info->max_extent_size = *bytes;
1840 return -1;
1841 }
1842
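/*
* Return the largest contiguous chunk an entry can hand out: the cached
* max_extent_size for bitmap entries, or the full size for extent entries.
*/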
1843 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1844 {
1845 if (entry->bitmap)
1846 return entry->max_extent_size;
1847 return entry->bytes;
1848 }
1849
1850 /* Cache the size of the max extent in bytes */
1851 static struct btrfs_free_space *
1852 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1853 unsigned long align, u64 *max_extent_size)
1854 {
1855 struct btrfs_free_space *entry;
1856 struct rb_node *node;
1857 u64 tmp;
1858 u64 align_off;
1859 int ret;
1860
1861 if (!ctl->free_space_offset.rb_node)
1862 goto out;
1863
1864 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1865 if (!entry)
1866 goto out;
1867
1868 for (node = &entry->offset_index; node; node = rb_next(node)) {
1869 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1870 if (entry->bytes < *bytes) {
1871 *max_extent_size = max(get_max_extent_size(entry),
1872 *max_extent_size);
1873 continue;
1874 }
1875
1876 /* make sure the space returned is big enough
1877 * to match our requested alignment
1878 */
1879 if (*bytes >= align) {
1880 tmp = entry->offset - ctl->start + align - 1;
1881 tmp = div64_u64(tmp, align);
1882 tmp = tmp * align + ctl->start;
1883 align_off = tmp - entry->offset;
1884 } else {
1885 align_off = 0;
1886 tmp = entry->offset;
1887 }
1888
1889 if (entry->bytes < *bytes + align_off) {
1890 *max_extent_size = max(get_max_extent_size(entry),
1891 *max_extent_size);
1892 continue;
1893 }
1894
1895 if (entry->bitmap) {
1896 u64 size = *bytes;
1897
1898 ret = search_bitmap(ctl, entry, &tmp, &size, true);
1899 if (!ret) {
1900 *offset = tmp;
1901 *bytes = size;
1902 return entry;
1903 } else {
1904 *max_extent_size =
1905 max(get_max_extent_size(entry),
1906 *max_extent_size);
1907 }
1908 continue;
1909 }
1910
1911 *offset = tmp;
1912 *bytes = entry->bytes - align_off;
1913 return entry;
1914 }
1915 out:
1916 return NULL;
1917 }
1918
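/*
* Count how many contiguous runs of set bits (i.e. free space extents) a
* bitmap entry currently tracks.
*/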
1919 static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
1920 struct btrfs_free_space *bitmap_info)
1921 {
1922 struct btrfs_block_group *block_group = ctl->private;
1923 u64 bytes = bitmap_info->bytes;
1924 unsigned int rs, re;
1925 int count = 0;
1926
1927 if (!block_group || !bytes)
1928 return count;
1929
1930 bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
1931 BITS_PER_BITMAP) {
1932 bytes -= (re - rs) * ctl->unit;
1933 count++;
1934
1935 if (!bytes)
1936 break;
1937 }
1938
1939 return count;
1940 }
1941
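/*
* Link a freshly allocated bitmap entry into the free space tree at the
* bitmap-aligned offset and recalculate the bitmap thresholds.
*/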
1942 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1943 struct btrfs_free_space *info, u64 offset)
1944 {
1945 info->offset = offset_to_bitmap(ctl, offset);
1946 info->bytes = 0;
1947 info->bitmap_extents = 0;
1948 INIT_LIST_HEAD(&info->list);
1949 link_free_space(ctl, info);
1950 ctl->total_bitmaps++;
1951
1952 ctl->op->recalc_thresholds(ctl);
1953 }
1954
1955 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1956 struct btrfs_free_space *bitmap_info)
1957 {
1958 /*
1959 * Normally when this is called, the bitmap is completely empty. However,
1960 * if we are blowing up the free space cache for one reason or another
1961 * via __btrfs_remove_free_space_cache(), then it may not be freed and
1962 * we may leave stats on the table.
1963 */
1964 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
1965 ctl->discardable_extents[BTRFS_STAT_CURR] -=
1966 bitmap_info->bitmap_extents;
1967 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
1968
1969 }
1970 unlink_free_space(ctl, bitmap_info);
1971 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1972 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1973 ctl->total_bitmaps--;
1974 ctl->op->recalc_thresholds(ctl);
1975 }
1976
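/*
* Remove [*offset, *offset + *bytes) from the bitmap(s) covering it.
* Returns 0 on success, -EAGAIN if the caller needs to retry because the
* remaining range continues outside of this bitmap, or -EINVAL if the
* expected free space could not be found.
*/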
1977 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1978 struct btrfs_free_space *bitmap_info,
1979 u64 *offset, u64 *bytes)
1980 {
1981 u64 end;
1982 u64 search_start, search_bytes;
1983 int ret;
1984
1985 again:
1986 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1987
1988 /*
1989 * We need to search for bits in this bitmap. We could only cover some
1990 * of the extent in this bitmap thanks to how we add space, so we need
1991 * to search for as much of it as we can and clear that amount, and then
1992 * go searching for the next bit.
1993 */
1994 search_start = *offset;
1995 search_bytes = ctl->unit;
1996 search_bytes = min(search_bytes, end - search_start + 1);
1997 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1998 false);
1999 if (ret < 0 || search_start != *offset)
2000 return -EINVAL;
2001
2002 /* We may have found more bits than what we need */
2003 search_bytes = min(search_bytes, *bytes);
2004
2005 /* Cannot clear past the end of the bitmap */
2006 search_bytes = min(search_bytes, end - search_start + 1);
2007
2008 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
2009 *offset += search_bytes;
2010 *bytes -= search_bytes;
2011
2012 if (*bytes) {
2013 struct rb_node *next = rb_next(&bitmap_info->offset_index);
2014 if (!bitmap_info->bytes)
2015 free_bitmap(ctl, bitmap_info);
2016
2017 /*
2018 * no entry after this bitmap, but we still have bytes to
2019 * remove, so something has gone wrong.
2020 */
2021 if (!next)
2022 return -EINVAL;
2023
2024 bitmap_info = rb_entry(next, struct btrfs_free_space,
2025 offset_index);
2026
2027 /*
2028 * if the next entry isn't a bitmap we need to return to let the
2029 * extent stuff do its work.
2030 */
2031 if (!bitmap_info->bitmap)
2032 return -EAGAIN;
2033
2034 /*
2035 * Ok the next item is a bitmap, but it may not actually hold
2036 * the information for the rest of this free space stuff, so
2037 * look for it, and if we don't find it return so we can try
2038 * everything over again.
2039 */
2040 search_start = *offset;
2041 search_bytes = ctl->unit;
2042 ret = search_bitmap(ctl, bitmap_info, &search_start,
2043 &search_bytes, false);
2044 if (ret < 0 || search_start != *offset)
2045 return -EAGAIN;
2046
2047 goto again;
2048 } else if (!bitmap_info->bytes)
2049 free_bitmap(ctl, bitmap_info);
2050
2051 return 0;
2052 }
2053
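/*
* Add as much of [offset, offset + bytes) as fits into this bitmap entry.
* Adding an untrimmed range marks the whole bitmap untrimmed. Returns the
* number of bytes that were actually set.
*/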
2054 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2055 struct btrfs_free_space *info, u64 offset,
2056 u64 bytes, enum btrfs_trim_state trim_state)
2057 {
2058 u64 bytes_to_set = 0;
2059 u64 end;
2060
2061 /*
2062 * This is a tradeoff to make bitmap trim state minimal. We mark the
2063 * whole bitmap untrimmed if at any point we add untrimmed regions.
2064 */
2065 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2066 if (btrfs_free_space_trimmed(info)) {
2067 ctl->discardable_extents[BTRFS_STAT_CURR] +=
2068 info->bitmap_extents;
2069 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2070 }
2071 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2072 }
2073
2074 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2075
2076 bytes_to_set = min(end - offset, bytes);
2077
2078 bitmap_set_bits(ctl, info, offset, bytes_to_set);
2079
2080 /*
2081 * We set some bytes, we have no idea what the max extent size is
2082 * anymore.
2083 */
2084 info->max_extent_size = 0;
2085
2086 return bytes_to_set;
2087
2088 }
2089
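/*
* Decide whether a new free space region should go into a bitmap instead
* of being tracked as a separate extent entry.
*/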
2090 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2091 struct btrfs_free_space *info)
2092 {
2093 struct btrfs_block_group *block_group = ctl->private;
2094 struct btrfs_fs_info *fs_info = block_group->fs_info;
2095 bool forced = false;
2096
2097 #ifdef CONFIG_BTRFS_DEBUG
2098 if (btrfs_should_fragment_free_space(block_group))
2099 forced = true;
2100 #endif
2101
2102 /* This is a way to reclaim large regions from the bitmaps. */
2103 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2104 return false;
2105
2106 /*
2107 * If we are below the extents threshold then we can add this as an
2108 * extent, and don't have to deal with the bitmap
2109 */
2110 if (!forced && ctl->free_extents < ctl->extents_thresh) {
2111 /*
2112 * If this block group has some small extents we don't want to
2113 * use up all of our free slots in the cache with them, we want
2114 * to reserve them to larger extents, however if we have plenty
2115 * of cache left then go ahead and add them, no sense in adding
2116 * the overhead of a bitmap if we don't have to.
2117 */
2118 if (info->bytes <= fs_info->sectorsize * 8) {
2119 if (ctl->free_extents * 3 <= ctl->extents_thresh)
2120 return false;
2121 } else {
2122 return false;
2123 }
2124 }
2125
2126 /*
2127 * The original block groups from mkfs can be really small, like 8
2128 * megabytes, so don't bother with a bitmap for those entries. However
2129 * some block groups can be smaller than what a bitmap would cover but
2130 * are still large enough that they could overflow the 32k memory limit,
2131 * so still allow those block groups to have a bitmap
2132 * entry.
2133 */
2134 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2135 return false;
2136
2137 return true;
2138 }
2139
2140 static const struct btrfs_free_space_op free_space_op = {
2141 .recalc_thresholds = recalculate_thresholds,
2142 .use_bitmap = use_bitmap,
2143 };
2144
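/*
* Try to record the free space described by @info in a bitmap, preferring
* a bitmap already attached to the block group's cluster. Returns 1 if the
* space was added to a bitmap, 0 if the caller should add it as an extent
* entry, or a negative errno on failure.
*/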
2145 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2146 struct btrfs_free_space *info)
2147 {
2148 struct btrfs_free_space *bitmap_info;
2149 struct btrfs_block_group *block_group = NULL;
2150 int added = 0;
2151 u64 bytes, offset, bytes_added;
2152 enum btrfs_trim_state trim_state;
2153 int ret;
2154
2155 bytes = info->bytes;
2156 offset = info->offset;
2157 trim_state = info->trim_state;
2158
2159 if (!ctl->op->use_bitmap(ctl, info))
2160 return 0;
2161
2162 if (ctl->op == &free_space_op)
2163 block_group = ctl->private;
2164 again:
2165 /*
2166 * Since we link bitmaps right into the cluster we need to see if we
2167 * have a cluster here, and if so and it has our bitmap we need to add
2168 * the free space to that bitmap.
2169 */
2170 if (block_group && !list_empty(&block_group->cluster_list)) {
2171 struct btrfs_free_cluster *cluster;
2172 struct rb_node *node;
2173 struct btrfs_free_space *entry;
2174
2175 cluster = list_entry(block_group->cluster_list.next,
2176 struct btrfs_free_cluster,
2177 block_group_list);
2178 spin_lock(&cluster->lock);
2179 node = rb_first(&cluster->root);
2180 if (!node) {
2181 spin_unlock(&cluster->lock);
2182 goto no_cluster_bitmap;
2183 }
2184
2185 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2186 if (!entry->bitmap) {
2187 spin_unlock(&cluster->lock);
2188 goto no_cluster_bitmap;
2189 }
2190
2191 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2192 bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2193 bytes, trim_state);
2194 bytes -= bytes_added;
2195 offset += bytes_added;
2196 }
2197 spin_unlock(&cluster->lock);
2198 if (!bytes) {
2199 ret = 1;
2200 goto out;
2201 }
2202 }
2203
2204 no_cluster_bitmap:
2205 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2206 1, 0);
2207 if (!bitmap_info) {
2208 ASSERT(added == 0);
2209 goto new_bitmap;
2210 }
2211
2212 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2213 trim_state);
2214 bytes -= bytes_added;
2215 offset += bytes_added;
2216 added = 0;
2217
2218 if (!bytes) {
2219 ret = 1;
2220 goto out;
2221 } else
2222 goto again;
2223
2224 new_bitmap:
2225 if (info && info->bitmap) {
2226 add_new_bitmap(ctl, info, offset);
2227 added = 1;
2228 info = NULL;
2229 goto again;
2230 } else {
2231 spin_unlock(&ctl->tree_lock);
2232
2233 /* no pre-allocated info, allocate a new one */
2234 if (!info) {
2235 info = kmem_cache_zalloc(btrfs_free_space_cachep,
2236 GFP_NOFS);
2237 if (!info) {
2238 spin_lock(&ctl->tree_lock);
2239 ret = -ENOMEM;
2240 goto out;
2241 }
2242 }
2243
2244 /* allocate the bitmap */
2245 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2246 GFP_NOFS);
2247 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2248 spin_lock(&ctl->tree_lock);
2249 if (!info->bitmap) {
2250 ret = -ENOMEM;
2251 goto out;
2252 }
2253 goto again;
2254 }
2255
2256 out:
2257 if (info) {
2258 if (info->bitmap)
2259 kmem_cache_free(btrfs_free_space_bitmap_cachep,
2260 info->bitmap);
2261 kmem_cache_free(btrfs_free_space_cachep, info);
2262 }
2263
2264 return ret;
2265 }
2266
2267 /*
2268 * Free space merging rules:
2269 * 1) Merge trimmed areas together
2270 * 2) Let untrimmed areas coalesce with trimmed areas
2271 * 3) Always pull neighboring regions from bitmaps
2272 *
2273 * The above rules are for when we merge free space based on btrfs_trim_state.
2274 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2275 * same reason: to promote larger extent regions which makes life easier for
2276 * find_free_extent(). Rule 2 enables coalescing based on the common path,
2277 * which is returning free space from btrfs_finish_extent_commit(). So once
2278 * free space has been trimmed, this prevents aggregating a newly trimmed
2279 * region with untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
2280 * and provide find_free_extent() with the largest extents possible hoping for
2281 * the reuse path.
2282 */
2283 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2284 struct btrfs_free_space *info, bool update_stat)
2285 {
2286 struct btrfs_free_space *left_info = NULL;
2287 struct btrfs_free_space *right_info;
2288 bool merged = false;
2289 u64 offset = info->offset;
2290 u64 bytes = info->bytes;
2291 const bool is_trimmed = btrfs_free_space_trimmed(info);
2292
2293 /*
2294 * first we want to see if there is free space adjacent to the range we
2295 * are adding, if there is remove that struct and add a new one to
2296 * cover the entire range
2297 */
2298 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2299 if (right_info && rb_prev(&right_info->offset_index))
2300 left_info = rb_entry(rb_prev(&right_info->offset_index),
2301 struct btrfs_free_space, offset_index);
2302 else if (!right_info)
2303 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2304
2305 /* See try_merge_free_space() comment. */
2306 if (right_info && !right_info->bitmap &&
2307 (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2308 if (update_stat)
2309 unlink_free_space(ctl, right_info);
2310 else
2311 __unlink_free_space(ctl, right_info);
2312 info->bytes += right_info->bytes;
2313 kmem_cache_free(btrfs_free_space_cachep, right_info);
2314 merged = true;
2315 }
2316
2317 /* See try_merge_free_space() comment. */
2318 if (left_info && !left_info->bitmap &&
2319 left_info->offset + left_info->bytes == offset &&
2320 (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2321 if (update_stat)
2322 unlink_free_space(ctl, left_info);
2323 else
2324 __unlink_free_space(ctl, left_info);
2325 info->offset = left_info->offset;
2326 info->bytes += left_info->bytes;
2327 kmem_cache_free(btrfs_free_space_cachep, left_info);
2328 merged = true;
2329 }
2330
2331 return merged;
2332 }
2333
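/*
* If a bitmap entry tracks free space immediately following @info, pull
* that space out of the bitmap and append it to @info. Returns true if
* any space was stolen.
*/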
2334 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2335 struct btrfs_free_space *info,
2336 bool update_stat)
2337 {
2338 struct btrfs_free_space *bitmap;
2339 unsigned long i;
2340 unsigned long j;
2341 const u64 end = info->offset + info->bytes;
2342 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2343 u64 bytes;
2344
2345 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2346 if (!bitmap)
2347 return false;
2348
2349 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2350 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2351 if (j == i)
2352 return false;
2353 bytes = (j - i) * ctl->unit;
2354 info->bytes += bytes;
2355
2356 /* See try_merge_free_space() comment. */
2357 if (!btrfs_free_space_trimmed(bitmap))
2358 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2359
2360 if (update_stat)
2361 bitmap_clear_bits(ctl, bitmap, end, bytes);
2362 else
2363 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2364
2365 if (!bitmap->bytes)
2366 free_bitmap(ctl, bitmap);
2367
2368 return true;
2369 }
2370
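/*
* Counterpart of steal_from_bitmap_to_end(): pull free space immediately
* preceding @info out of the covering bitmap and prepend it to @info.
*/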
2371 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2372 struct btrfs_free_space *info,
2373 bool update_stat)
2374 {
2375 struct btrfs_free_space *bitmap;
2376 u64 bitmap_offset;
2377 unsigned long i;
2378 unsigned long j;
2379 unsigned long prev_j;
2380 u64 bytes;
2381
2382 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2383 /* If we're on a boundary, try the previous logical bitmap. */
2384 if (bitmap_offset == info->offset) {
2385 if (info->offset == 0)
2386 return false;
2387 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2388 }
2389
2390 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2391 if (!bitmap)
2392 return false;
2393
2394 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2395 j = 0;
2396 prev_j = (unsigned long)-1;
2397 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2398 if (j > i)
2399 break;
2400 prev_j = j;
2401 }
2402 if (prev_j == i)
2403 return false;
2404
2405 if (prev_j == (unsigned long)-1)
2406 bytes = (i + 1) * ctl->unit;
2407 else
2408 bytes = (i - prev_j) * ctl->unit;
2409
2410 info->offset -= bytes;
2411 info->bytes += bytes;
2412
2413 /* See try_merge_free_space() comment. */
2414 if (!btrfs_free_space_trimmed(bitmap))
2415 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2416
2417 if (update_stat)
2418 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2419 else
2420 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2421
2422 if (!bitmap->bytes)
2423 free_bitmap(ctl, bitmap);
2424
2425 return true;
2426 }
2427
2428 /*
2429 * We prefer always to allocate from extent entries, both for clustered and
2430 * non-clustered allocation requests. So when attempting to add a new extent
2431 * entry, try to see if there's adjacent free space in bitmap entries, and if
2432 * there is, migrate that space from the bitmaps to the extent.
2433 * Like this we get better chances of satisfying space allocation requests
2434 * because we attempt to satisfy them based on a single cache entry, and never
2435 * on 2 or more entries - even if the entries represent a contiguous free space
2436 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2437 * ends).
2438 */
2439 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2440 struct btrfs_free_space *info,
2441 bool update_stat)
2442 {
2443 /*
2444 * Only work with disconnected entries, as we can change their offset,
2445 * and they must be extent entries.
2446 */
2447 ASSERT(!info->bitmap);
2448 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2449
2450 if (ctl->total_bitmaps > 0) {
2451 bool stole_end;
2452 bool stole_front = false;
2453
2454 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2455 if (ctl->total_bitmaps > 0)
2456 stole_front = steal_from_bitmap_to_front(ctl, info,
2457 update_stat);
2458
2459 if (stole_end || stole_front)
2460 try_merge_free_space(ctl, info, update_stat);
2461 }
2462 }
2463
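/*
* Add [offset, offset + bytes) to the free space cache, merging with
* neighbouring extents and stealing from adjacent bitmaps where possible.
* @trim_state records whether the range still needs to be discarded.
*/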
2464 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2465 struct btrfs_free_space_ctl *ctl,
2466 u64 offset, u64 bytes,
2467 enum btrfs_trim_state trim_state)
2468 {
2469 struct btrfs_block_group *block_group = ctl->private;
2470 struct btrfs_free_space *info;
2471 int ret = 0;
2472 u64 filter_bytes = bytes;
2473
2474 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2475 if (!info)
2476 return -ENOMEM;
2477
2478 info->offset = offset;
2479 info->bytes = bytes;
2480 info->trim_state = trim_state;
2481 RB_CLEAR_NODE(&info->offset_index);
2482
2483 spin_lock(&ctl->tree_lock);
2484
2485 if (try_merge_free_space(ctl, info, true))
2486 goto link;
2487
2488 /*
2489 * There was no extent directly to the left or right of this new
2490 * extent, so we know we're going to have to allocate a new extent;
2491 * before we do that, see if we need to drop this into a bitmap.
2492 */
2493 ret = insert_into_bitmap(ctl, info);
2494 if (ret < 0) {
2495 goto out;
2496 } else if (ret) {
2497 ret = 0;
2498 goto out;
2499 }
2500 link:
2501 /*
2502 * Only steal free space from adjacent bitmaps if we're sure we're not
2503 * going to add the new free space to existing bitmap entries - because
2504 * that would mean unnecessary work that would be reverted. Therefore
2505 * attempt to steal space from bitmaps if we're adding an extent entry.
2506 */
2507 steal_from_bitmap(ctl, info, true);
2508
2509 filter_bytes = max(filter_bytes, info->bytes);
2510
2511 ret = link_free_space(ctl, info);
2512 if (ret)
2513 kmem_cache_free(btrfs_free_space_cachep, info);
2514 out:
2515 btrfs_discard_update_discardable(block_group, ctl);
2516 spin_unlock(&ctl->tree_lock);
2517
2518 if (ret) {
2519 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2520 ASSERT(ret != -EEXIST);
2521 }
2522
2523 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2524 btrfs_discard_check_filter(block_group, filter_bytes);
2525 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2526 }
2527
2528 return ret;
2529 }
2530
2531 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2532 u64 bytenr, u64 size)
2533 {
2534 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2535
2536 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2537 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2538
2539 return __btrfs_add_free_space(block_group->fs_info,
2540 block_group->free_space_ctl,
2541 bytenr, size, trim_state);
2542 }
2543
2544 /*
2545 * This is a subtle distinction because when adding free space back in general,
2546 * we want it to be added as untrimmed for async. But in the case where we add
2547 * it on loading of a block group, we want to consider it trimmed.
2548 */
2549 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2550 u64 bytenr, u64 size)
2551 {
2552 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2553
2554 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2555 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2556 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2557
2558 return __btrfs_add_free_space(block_group->fs_info,
2559 block_group->free_space_ctl,
2560 bytenr, size, trim_state);
2561 }
2562
2563 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2564 u64 offset, u64 bytes)
2565 {
2566 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2567 struct btrfs_free_space *info;
2568 int ret;
2569 bool re_search = false;
2570
2571 spin_lock(&ctl->tree_lock);
2572
2573 again:
2574 ret = 0;
2575 if (!bytes)
2576 goto out_lock;
2577
2578 info = tree_search_offset(ctl, offset, 0, 0);
2579 if (!info) {
2580 /*
2581 * oops didn't find an extent that matched the space we wanted
2582 * to remove, look for a bitmap instead
2583 */
2584 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2585 1, 0);
2586 if (!info) {
2587 /*
2588 * If we found a partial bit of our free space in a
2589 * bitmap but then couldn't find the other part this may
2590 * be a problem, so WARN about it.
2591 */
2592 WARN_ON(re_search);
2593 goto out_lock;
2594 }
2595 }
2596
2597 re_search = false;
2598 if (!info->bitmap) {
2599 unlink_free_space(ctl, info);
2600 if (offset == info->offset) {
2601 u64 to_free = min(bytes, info->bytes);
2602
2603 info->bytes -= to_free;
2604 info->offset += to_free;
2605 if (info->bytes) {
2606 ret = link_free_space(ctl, info);
2607 WARN_ON(ret);
2608 } else {
2609 kmem_cache_free(btrfs_free_space_cachep, info);
2610 }
2611
2612 offset += to_free;
2613 bytes -= to_free;
2614 goto again;
2615 } else {
2616 u64 old_end = info->bytes + info->offset;
2617
2618 info->bytes = offset - info->offset;
2619 ret = link_free_space(ctl, info);
2620 WARN_ON(ret);
2621 if (ret)
2622 goto out_lock;
2623
2624 /* Not enough bytes in this entry to satisfy us */
2625 if (old_end < offset + bytes) {
2626 bytes -= old_end - offset;
2627 offset = old_end;
2628 goto again;
2629 } else if (old_end == offset + bytes) {
2630 /* all done */
2631 goto out_lock;
2632 }
2633 spin_unlock(&ctl->tree_lock);
2634
2635 ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2636 offset + bytes,
2637 old_end - (offset + bytes),
2638 info->trim_state);
2639 WARN_ON(ret);
2640 goto out;
2641 }
2642 }
2643
2644 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2645 if (ret == -EAGAIN) {
2646 re_search = true;
2647 goto again;
2648 }
2649 out_lock:
2650 btrfs_discard_update_discardable(block_group, ctl);
2651 spin_unlock(&ctl->tree_lock);
2652 out:
2653 return ret;
2654 }
2655
2656 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2657 u64 bytes)
2658 {
2659 struct btrfs_fs_info *fs_info = block_group->fs_info;
2660 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2661 struct btrfs_free_space *info;
2662 struct rb_node *n;
2663 int count = 0;
2664
2665 spin_lock(&ctl->tree_lock);
2666 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2667 info = rb_entry(n, struct btrfs_free_space, offset_index);
2668 if (info->bytes >= bytes && !block_group->ro)
2669 count++;
2670 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2671 info->offset, info->bytes,
2672 (info->bitmap) ? "yes" : "no");
2673 }
2674 spin_unlock(&ctl->tree_lock);
2675 btrfs_info(fs_info, "block group has cluster?: %s",
2676 list_empty(&block_group->cluster_list) ? "no" : "yes");
2677 btrfs_info(fs_info,
2678 "%d blocks of free space at or bigger than bytes is", count);
2679 }
2680
2681 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
2682 {
2683 struct btrfs_fs_info *fs_info = block_group->fs_info;
2684 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2685
2686 spin_lock_init(&ctl->tree_lock);
2687 ctl->unit = fs_info->sectorsize;
2688 ctl->start = block_group->start;
2689 ctl->private = block_group;
2690 ctl->op = &free_space_op;
2691 INIT_LIST_HEAD(&ctl->trimming_ranges);
2692 mutex_init(&ctl->cache_writeout_mutex);
2693
2694 /*
2695 * we only want to have 32k of ram per block group for keeping
2696 * track of free space, and if we pass 1/2 of that we want to
2697 * start converting things over to using bitmaps
2698 */
2699 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2700 }
2701
2702 /*
2703 * for a given cluster, put all of its extents back into the free
2704 * space cache. If the block group passed doesn't match the block group
2705 * pointed to by the cluster, someone else raced in and freed the
2706 * cluster already. In that case, we just return without changing anything
2707 */
2708 static void __btrfs_return_cluster_to_free_space(
2709 struct btrfs_block_group *block_group,
2710 struct btrfs_free_cluster *cluster)
2711 {
2712 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2713 struct btrfs_free_space *entry;
2714 struct rb_node *node;
2715
2716 spin_lock(&cluster->lock);
2717 if (cluster->block_group != block_group) {
2718 spin_unlock(&cluster->lock);
2719 return;
2720 }
2721
2722 cluster->block_group = NULL;
2723 cluster->window_start = 0;
2724 list_del_init(&cluster->block_group_list);
2725
2726 node = rb_first(&cluster->root);
2727 while (node) {
2728 bool bitmap;
2729
2730 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2731 node = rb_next(&entry->offset_index);
2732 rb_erase(&entry->offset_index, &cluster->root);
2733 RB_CLEAR_NODE(&entry->offset_index);
2734
2735 bitmap = (entry->bitmap != NULL);
2736 if (!bitmap) {
2737 /* Merging treats extents as if they were new */
2738 if (!btrfs_free_space_trimmed(entry)) {
2739 ctl->discardable_extents[BTRFS_STAT_CURR]--;
2740 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2741 entry->bytes;
2742 }
2743
2744 try_merge_free_space(ctl, entry, false);
2745 steal_from_bitmap(ctl, entry, false);
2746
2747 /* As we insert directly, update these statistics */
2748 if (!btrfs_free_space_trimmed(entry)) {
2749 ctl->discardable_extents[BTRFS_STAT_CURR]++;
2750 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2751 entry->bytes;
2752 }
2753 }
2754 tree_insert_offset(&ctl->free_space_offset,
2755 entry->offset, &entry->offset_index, bitmap);
2756 }
2757 cluster->root = RB_ROOT;
2758 spin_unlock(&cluster->lock);
2759 btrfs_put_block_group(block_group);
2760 }
2761
2762 static void __btrfs_remove_free_space_cache_locked(
2763 struct btrfs_free_space_ctl *ctl)
2764 {
2765 struct btrfs_free_space *info;
2766 struct rb_node *node;
2767
2768 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2769 info = rb_entry(node, struct btrfs_free_space, offset_index);
2770 if (!info->bitmap) {
2771 unlink_free_space(ctl, info);
2772 kmem_cache_free(btrfs_free_space_cachep, info);
2773 } else {
2774 free_bitmap(ctl, info);
2775 }
2776
2777 cond_resched_lock(&ctl->tree_lock);
2778 }
2779 }
2780
2781 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2782 {
2783 spin_lock(&ctl->tree_lock);
2784 __btrfs_remove_free_space_cache_locked(ctl);
2785 if (ctl->private)
2786 btrfs_discard_update_discardable(ctl->private, ctl);
2787 spin_unlock(&ctl->tree_lock);
2788 }
2789
2790 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2791 {
2792 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2793 struct btrfs_free_cluster *cluster;
2794 struct list_head *head;
2795
2796 spin_lock(&ctl->tree_lock);
2797 while ((head = block_group->cluster_list.next) !=
2798 &block_group->cluster_list) {
2799 cluster = list_entry(head, struct btrfs_free_cluster,
2800 block_group_list);
2801
2802 WARN_ON(cluster->block_group != block_group);
2803 __btrfs_return_cluster_to_free_space(block_group, cluster);
2804
2805 cond_resched_lock(&ctl->tree_lock);
2806 }
2807 __btrfs_remove_free_space_cache_locked(ctl);
2808 btrfs_discard_update_discardable(block_group, ctl);
2809 spin_unlock(&ctl->tree_lock);
2810
2811 }
2812
2813 /**
2814 * btrfs_is_free_space_trimmed - see if everything is trimmed
2815 * @block_group: block_group of interest
2816 *
2817 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2818 */
2819 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2820 {
2821 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2822 struct btrfs_free_space *info;
2823 struct rb_node *node;
2824 bool ret = true;
2825
2826 spin_lock(&ctl->tree_lock);
2827 node = rb_first(&ctl->free_space_offset);
2828
2829 while (node) {
2830 info = rb_entry(node, struct btrfs_free_space, offset_index);
2831
2832 if (!btrfs_free_space_trimmed(info)) {
2833 ret = false;
2834 break;
2835 }
2836
2837 node = rb_next(node);
2838 }
2839
2840 spin_unlock(&ctl->tree_lock);
2841 return ret;
2842 }
2843
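/*
* Search for at least @bytes + @empty_size of contiguous free space near
* @offset, honouring the block group's full stripe alignment, remove
* @bytes of it and return its start, or 0 if nothing suitable was found.
* Any alignment gap split off the front of an extent entry is handed back
* to the free space cache.
*/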
2844 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2845 u64 offset, u64 bytes, u64 empty_size,
2846 u64 *max_extent_size)
2847 {
2848 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2849 struct btrfs_discard_ctl *discard_ctl =
2850 &block_group->fs_info->discard_ctl;
2851 struct btrfs_free_space *entry = NULL;
2852 u64 bytes_search = bytes + empty_size;
2853 u64 ret = 0;
2854 u64 align_gap = 0;
2855 u64 align_gap_len = 0;
2856 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2857
2858 spin_lock(&ctl->tree_lock);
2859 entry = find_free_space(ctl, &offset, &bytes_search,
2860 block_group->full_stripe_len, max_extent_size);
2861 if (!entry)
2862 goto out;
2863
2864 ret = offset;
2865 if (entry->bitmap) {
2866 bitmap_clear_bits(ctl, entry, offset, bytes);
2867
2868 if (!btrfs_free_space_trimmed(entry))
2869 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2870
2871 if (!entry->bytes)
2872 free_bitmap(ctl, entry);
2873 } else {
2874 unlink_free_space(ctl, entry);
2875 align_gap_len = offset - entry->offset;
2876 align_gap = entry->offset;
2877 align_gap_trim_state = entry->trim_state;
2878
2879 if (!btrfs_free_space_trimmed(entry))
2880 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2881
2882 entry->offset = offset + bytes;
2883 WARN_ON(entry->bytes < bytes + align_gap_len);
2884
2885 entry->bytes -= bytes + align_gap_len;
2886 if (!entry->bytes)
2887 kmem_cache_free(btrfs_free_space_cachep, entry);
2888 else
2889 link_free_space(ctl, entry);
2890 }
2891 out:
2892 btrfs_discard_update_discardable(block_group, ctl);
2893 spin_unlock(&ctl->tree_lock);
2894
2895 if (align_gap_len)
2896 __btrfs_add_free_space(block_group->fs_info, ctl,
2897 align_gap, align_gap_len,
2898 align_gap_trim_state);
2899 return ret;
2900 }
2901
2902 /*
2903 * given a cluster, put all of its extents back into the free space
2904 * cache. If a block group is passed, this function will only free
2905 * a cluster that belongs to the passed block group.
2906 *
2907 * Otherwise, it'll get a reference on the block group pointed to by the
2908 * cluster and remove the cluster from it.
2909 */
2910 void btrfs_return_cluster_to_free_space(
2911 struct btrfs_block_group *block_group,
2912 struct btrfs_free_cluster *cluster)
2913 {
2914 struct btrfs_free_space_ctl *ctl;
2915
2916 /* first, get a safe pointer to the block group */
2917 spin_lock(&cluster->lock);
2918 if (!block_group) {
2919 block_group = cluster->block_group;
2920 if (!block_group) {
2921 spin_unlock(&cluster->lock);
2922 return;
2923 }
2924 } else if (cluster->block_group != block_group) {
2925 /* someone else has already freed it, don't redo their work */
2926 spin_unlock(&cluster->lock);
2927 return;
2928 }
2929 btrfs_get_block_group(block_group);
2930 spin_unlock(&cluster->lock);
2931
2932 ctl = block_group->free_space_ctl;
2933
2934 /* now return any extents the cluster had on it */
2935 spin_lock(&ctl->tree_lock);
2936 __btrfs_return_cluster_to_free_space(block_group, cluster);
2937 spin_unlock(&ctl->tree_lock);
2938
2939 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
2940
2941 /* finally drop our ref */
2942 btrfs_put_block_group(block_group);
2943 }
2944
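/*
* Allocate @bytes from a bitmap entry that belongs to a cluster. Returns
* the start offset on success, or 0 if the bitmap has no large enough run,
* in which case *max_extent_size is updated for the caller.
*/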
2945 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
2946 struct btrfs_free_cluster *cluster,
2947 struct btrfs_free_space *entry,
2948 u64 bytes, u64 min_start,
2949 u64 *max_extent_size)
2950 {
2951 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2952 int err;
2953 u64 search_start = cluster->window_start;
2954 u64 search_bytes = bytes;
2955 u64 ret = 0;
2956
2957 search_start = min_start;
2958 search_bytes = bytes;
2959
2960 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2961 if (err) {
2962 *max_extent_size = max(get_max_extent_size(entry),
2963 *max_extent_size);
2964 return 0;
2965 }
2966
2967 ret = search_start;
2968 __bitmap_clear_bits(ctl, entry, ret, bytes);
2969
2970 return ret;
2971 }
2972
2973 /*
2974 * given a cluster, try to allocate 'bytes' from it, returns 0
2975 * if it couldn't find anything suitably large, or a logical disk offset
2976 * if things worked out
2977 */
2978 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
2979 struct btrfs_free_cluster *cluster, u64 bytes,
2980 u64 min_start, u64 *max_extent_size)
2981 {
2982 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2983 struct btrfs_discard_ctl *discard_ctl =
2984 &block_group->fs_info->discard_ctl;
2985 struct btrfs_free_space *entry = NULL;
2986 struct rb_node *node;
2987 u64 ret = 0;
2988
2989 spin_lock(&cluster->lock);
2990 if (bytes > cluster->max_size)
2991 goto out;
2992
2993 if (cluster->block_group != block_group)
2994 goto out;
2995
2996 node = rb_first(&cluster->root);
2997 if (!node)
2998 goto out;
2999
3000 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3001 while (1) {
3002 if (entry->bytes < bytes)
3003 *max_extent_size = max(get_max_extent_size(entry),
3004 *max_extent_size);
3005
3006 if (entry->bytes < bytes ||
3007 (!entry->bitmap && entry->offset < min_start)) {
3008 node = rb_next(&entry->offset_index);
3009 if (!node)
3010 break;
3011 entry = rb_entry(node, struct btrfs_free_space,
3012 offset_index);
3013 continue;
3014 }
3015
3016 if (entry->bitmap) {
3017 ret = btrfs_alloc_from_bitmap(block_group,
3018 cluster, entry, bytes,
3019 cluster->window_start,
3020 max_extent_size);
3021 if (ret == 0) {
3022 node = rb_next(&entry->offset_index);
3023 if (!node)
3024 break;
3025 entry = rb_entry(node, struct btrfs_free_space,
3026 offset_index);
3027 continue;
3028 }
3029 cluster->window_start += bytes;
3030 } else {
3031 ret = entry->offset;
3032
3033 entry->offset += bytes;
3034 entry->bytes -= bytes;
3035 }
3036
3037 break;
3038 }
3039 out:
3040 spin_unlock(&cluster->lock);
3041
3042 if (!ret)
3043 return 0;
3044
3045 spin_lock(&ctl->tree_lock);
3046
3047 if (!btrfs_free_space_trimmed(entry))
3048 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3049
3050 ctl->free_space -= bytes;
3051 if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3052 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3053
3054 spin_lock(&cluster->lock);
3055 if (entry->bytes == 0) {
3056 rb_erase(&entry->offset_index, &cluster->root);
3057 ctl->free_extents--;
3058 if (entry->bitmap) {
3059 kmem_cache_free(btrfs_free_space_bitmap_cachep,
3060 entry->bitmap);
3061 ctl->total_bitmaps--;
3062 ctl->op->recalc_thresholds(ctl);
3063 } else if (!btrfs_free_space_trimmed(entry)) {
3064 ctl->discardable_extents[BTRFS_STAT_CURR]--;
3065 }
3066 kmem_cache_free(btrfs_free_space_cachep, entry);
3067 }
3068
3069 spin_unlock(&cluster->lock);
3070 spin_unlock(&ctl->tree_lock);
3071
3072 return ret;
3073 }
3074
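/*
* Try to build the cluster from a single bitmap entry: gather at least
* @bytes of free space made up of runs of at least @min_bytes, with one
* run of at least @cont1_bytes, then move the entry from the free space
* tree into the cluster's rbtree.
*/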
3075 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3076 struct btrfs_free_space *entry,
3077 struct btrfs_free_cluster *cluster,
3078 u64 offset, u64 bytes,
3079 u64 cont1_bytes, u64 min_bytes)
3080 {
3081 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3082 unsigned long next_zero;
3083 unsigned long i;
3084 unsigned long want_bits;
3085 unsigned long min_bits;
3086 unsigned long found_bits;
3087 unsigned long max_bits = 0;
3088 unsigned long start = 0;
3089 unsigned long total_found = 0;
3090 int ret;
3091
3092 i = offset_to_bit(entry->offset, ctl->unit,
3093 max_t(u64, offset, entry->offset));
3094 want_bits = bytes_to_bits(bytes, ctl->unit);
3095 min_bits = bytes_to_bits(min_bytes, ctl->unit);
3096
3097 /*
3098 * Don't bother looking for a cluster in this bitmap if it's heavily
3099 * fragmented.
3100 */
3101 if (entry->max_extent_size &&
3102 entry->max_extent_size < cont1_bytes)
3103 return -ENOSPC;
3104 again:
3105 found_bits = 0;
3106 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3107 next_zero = find_next_zero_bit(entry->bitmap,
3108 BITS_PER_BITMAP, i);
3109 if (next_zero - i >= min_bits) {
3110 found_bits = next_zero - i;
3111 if (found_bits > max_bits)
3112 max_bits = found_bits;
3113 break;
3114 }
3115 if (next_zero - i > max_bits)
3116 max_bits = next_zero - i;
3117 i = next_zero;
3118 }
3119
3120 if (!found_bits) {
3121 entry->max_extent_size = (u64)max_bits * ctl->unit;
3122 return -ENOSPC;
3123 }
3124
3125 if (!total_found) {
3126 start = i;
3127 cluster->max_size = 0;
3128 }
3129
3130 total_found += found_bits;
3131
3132 if (cluster->max_size < found_bits * ctl->unit)
3133 cluster->max_size = found_bits * ctl->unit;
3134
3135 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3136 i = next_zero + 1;
3137 goto again;
3138 }
3139
3140 cluster->window_start = start * ctl->unit + entry->offset;
3141 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3142 ret = tree_insert_offset(&cluster->root, entry->offset,
3143 &entry->offset_index, 1);
3144 ASSERT(!ret); /* -EEXIST; Logic error */
3145
3146 trace_btrfs_setup_cluster(block_group, cluster,
3147 total_found * ctl->unit, 1);
3148 return 0;
3149 }
3150
3151 /*
3152 * This searches the block group for just extents to fill the cluster with.
3153 * Try to find a cluster with at least bytes total bytes, at least one
3154 * extent of cont1_bytes, and other extents of at least min_bytes.
3155 */
3156 static noinline int
3157 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3158 struct btrfs_free_cluster *cluster,
3159 struct list_head *bitmaps, u64 offset, u64 bytes,
3160 u64 cont1_bytes, u64 min_bytes)
3161 {
3162 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3163 struct btrfs_free_space *first = NULL;
3164 struct btrfs_free_space *entry = NULL;
3165 struct btrfs_free_space *last;
3166 struct rb_node *node;
3167 u64 window_free;
3168 u64 max_extent;
3169 u64 total_size = 0;
3170
3171 entry = tree_search_offset(ctl, offset, 0, 1);
3172 if (!entry)
3173 return -ENOSPC;
3174
3175 /*
3176 * We don't want bitmaps, so just move along until we find a normal
3177 * extent entry.
3178 */
3179 while (entry->bitmap || entry->bytes < min_bytes) {
3180 if (entry->bitmap && list_empty(&entry->list))
3181 list_add_tail(&entry->list, bitmaps);
3182 node = rb_next(&entry->offset_index);
3183 if (!node)
3184 return -ENOSPC;
3185 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3186 }
3187
3188 window_free = entry->bytes;
3189 max_extent = entry->bytes;
3190 first = entry;
3191 last = entry;
3192
3193 for (node = rb_next(&entry->offset_index); node;
3194 node = rb_next(&entry->offset_index)) {
3195 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3196
3197 if (entry->bitmap) {
3198 if (list_empty(&entry->list))
3199 list_add_tail(&entry->list, bitmaps);
3200 continue;
3201 }
3202
3203 if (entry->bytes < min_bytes)
3204 continue;
3205
3206 last = entry;
3207 window_free += entry->bytes;
3208 if (entry->bytes > max_extent)
3209 max_extent = entry->bytes;
3210 }
3211
3212 if (window_free < bytes || max_extent < cont1_bytes)
3213 return -ENOSPC;
3214
3215 cluster->window_start = first->offset;
3216
3217 node = &first->offset_index;
3218
3219 /*
3220 * now we've found our entries, pull them out of the free space
3221 * cache and put them into the cluster rbtree
3222 */
3223 do {
3224 int ret;
3225
3226 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3227 node = rb_next(&entry->offset_index);
3228 if (entry->bitmap || entry->bytes < min_bytes)
3229 continue;
3230
3231 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3232 ret = tree_insert_offset(&cluster->root, entry->offset,
3233 &entry->offset_index, 0);
3234 total_size += entry->bytes;
3235 ASSERT(!ret); /* -EEXIST; Logic error */
3236 } while (node && entry != last);
3237
3238 cluster->max_size = max_extent;
3239 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3240 return 0;
3241 }
3242
3243 /*
3244 * This specifically looks for bitmaps that may work in the cluster, we assume
3245 * that we have already failed to find extents that will work.
3246 */
3247 static noinline int
3248 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3249 struct btrfs_free_cluster *cluster,
3250 struct list_head *bitmaps, u64 offset, u64 bytes,
3251 u64 cont1_bytes, u64 min_bytes)
3252 {
3253 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3254 struct btrfs_free_space *entry = NULL;
3255 int ret = -ENOSPC;
3256 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3257
3258 if (ctl->total_bitmaps == 0)
3259 return -ENOSPC;
3260
3261 /*
3262 * The bitmap that covers offset won't be in the list unless offset
3263 * is just its start offset.
3264 */
3265 if (!list_empty(bitmaps))
3266 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3267
3268 if (!entry || entry->offset != bitmap_offset) {
3269 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3270 if (entry && list_empty(&entry->list))
3271 list_add(&entry->list, bitmaps);
3272 }
3273
3274 list_for_each_entry(entry, bitmaps, list) {
3275 if (entry->bytes < bytes)
3276 continue;
3277 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3278 bytes, cont1_bytes, min_bytes);
3279 if (!ret)
3280 return 0;
3281 }
3282
3283 /*
3284 * The bitmaps list has all the bitmaps that record free space
3285 * starting after offset, so no more search is required.
3286 */
3287 return -ENOSPC;
3288 }
3289
3290 /*
3291 * here we try to find a cluster of blocks in a block group. The goal
3292 * is to find at least bytes+empty_size.
3293 * We might not find them all in one contiguous area.
3294 *
3295 * returns zero and sets up cluster if things worked out, otherwise
3296 * it returns -ENOSPC
3297 */
3298 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3299 struct btrfs_free_cluster *cluster,
3300 u64 offset, u64 bytes, u64 empty_size)
3301 {
3302 struct btrfs_fs_info *fs_info = block_group->fs_info;
3303 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3304 struct btrfs_free_space *entry, *tmp;
3305 LIST_HEAD(bitmaps);
3306 u64 min_bytes;
3307 u64 cont1_bytes;
3308 int ret;
3309
3310 /*
3311 * Choose the minimum extent size we'll require for this
3312 * cluster. For SSD_SPREAD, don't allow any fragmentation.
3313 * For metadata, allow allocations with smaller extents. For
3314 * data, keep it dense.
3315 */
3316 if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3317 cont1_bytes = min_bytes = bytes + empty_size;
3318 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3319 cont1_bytes = bytes;
3320 min_bytes = fs_info->sectorsize;
3321 } else {
3322 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3323 min_bytes = fs_info->sectorsize;
3324 }
3325
3326 spin_lock(&ctl->tree_lock);
3327
3328 /*
3329 * If we know we don't have enough space to make a cluster don't even
3330 * bother doing all the work to try and find one.
3331 */
3332 if (ctl->free_space < bytes) {
3333 spin_unlock(&ctl->tree_lock);
3334 return -ENOSPC;
3335 }
3336
3337 spin_lock(&cluster->lock);
3338
3339 /* someone already found a cluster, hooray */
3340 if (cluster->block_group) {
3341 ret = 0;
3342 goto out;
3343 }
3344
3345 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3346 min_bytes);
3347
3348 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3349 bytes + empty_size,
3350 cont1_bytes, min_bytes);
3351 if (ret)
3352 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3353 offset, bytes + empty_size,
3354 cont1_bytes, min_bytes);
3355
3356 /* Clear our temporary list */
3357 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3358 list_del_init(&entry->list);
3359
3360 if (!ret) {
3361 btrfs_get_block_group(block_group);
3362 list_add_tail(&cluster->block_group_list,
3363 &block_group->cluster_list);
3364 cluster->block_group = block_group;
3365 } else {
3366 trace_btrfs_failed_cluster_setup(block_group);
3367 }
3368 out:
3369 spin_unlock(&cluster->lock);
3370 spin_unlock(&ctl->tree_lock);
3371
3372 return ret;
3373 }
3374
3375 /*
3376 * simple code to zero out a cluster
3377 */
3378 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3379 {
3380 spin_lock_init(&cluster->lock);
3381 spin_lock_init(&cluster->refill_lock);
3382 cluster->root = RB_ROOT;
3383 cluster->max_size = 0;
3384 cluster->fragmented = false;
3385 INIT_LIST_HEAD(&cluster->block_group_list);
3386 cluster->block_group = NULL;
3387 }
3388
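/*
* Issue the discard for [start, start + bytes) and give the reserved range
* back to the free space cache, re-adding any leading/trailing pieces with
* their old trim state. The space is temporarily accounted as reserved so
* it cannot be allocated while the discard is in flight.
*/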
3389 static int do_trimming(struct btrfs_block_group *block_group,
3390 u64 *total_trimmed, u64 start, u64 bytes,
3391 u64 reserved_start, u64 reserved_bytes,
3392 enum btrfs_trim_state reserved_trim_state,
3393 struct btrfs_trim_range *trim_entry)
3394 {
3395 struct btrfs_space_info *space_info = block_group->space_info;
3396 struct btrfs_fs_info *fs_info = block_group->fs_info;
3397 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3398 int ret;
3399 int update = 0;
3400 const u64 end = start + bytes;
3401 const u64 reserved_end = reserved_start + reserved_bytes;
3402 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3403 u64 trimmed = 0;
3404
3405 spin_lock(&space_info->lock);
3406 spin_lock(&block_group->lock);
3407 if (!block_group->ro) {
3408 block_group->reserved += reserved_bytes;
3409 space_info->bytes_reserved += reserved_bytes;
3410 update = 1;
3411 }
3412 spin_unlock(&block_group->lock);
3413 spin_unlock(&space_info->lock);
3414
3415 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3416 if (!ret) {
3417 *total_trimmed += trimmed;
3418 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3419 }
3420
3421 mutex_lock(&ctl->cache_writeout_mutex);
3422 if (reserved_start < start)
3423 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3424 start - reserved_start,
3425 reserved_trim_state);
3426 if (start + bytes < reserved_start + reserved_bytes)
3427 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3428 reserved_trim_state);
3429 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3430 list_del(&trim_entry->list);
3431 mutex_unlock(&ctl->cache_writeout_mutex);
3432
3433 if (update) {
3434 spin_lock(&space_info->lock);
3435 spin_lock(&block_group->lock);
3436 if (block_group->ro)
3437 space_info->bytes_readonly += reserved_bytes;
3438 block_group->reserved -= reserved_bytes;
3439 space_info->bytes_reserved -= reserved_bytes;
3440 spin_unlock(&block_group->lock);
3441 spin_unlock(&space_info->lock);
3442 }
3443
3444 return ret;
3445 }
3446
3447 /*
3448 * If @async is set, then we will trim 1 region and return.
3449 */
3450 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3451 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3452 bool async)
3453 {
3454 struct btrfs_discard_ctl *discard_ctl =
3455 &block_group->fs_info->discard_ctl;
3456 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3457 struct btrfs_free_space *entry;
3458 struct rb_node *node;
3459 int ret = 0;
3460 u64 extent_start;
3461 u64 extent_bytes;
3462 enum btrfs_trim_state extent_trim_state;
3463 u64 bytes;
3464 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3465
3466 while (start < end) {
3467 struct btrfs_trim_range trim_entry;
3468
3469 mutex_lock(&ctl->cache_writeout_mutex);
3470 spin_lock(&ctl->tree_lock);
3471
3472 if (ctl->free_space < minlen)
3473 goto out_unlock;
3474
3475 entry = tree_search_offset(ctl, start, 0, 1);
3476 if (!entry)
3477 goto out_unlock;
3478
3479 /* Skip bitmaps and if async, already trimmed entries */
3480 while (entry->bitmap ||
3481 (async && btrfs_free_space_trimmed(entry))) {
3482 node = rb_next(&entry->offset_index);
3483 if (!node)
3484 goto out_unlock;
3485 entry = rb_entry(node, struct btrfs_free_space,
3486 offset_index);
3487 }
3488
3489 if (entry->offset >= end)
3490 goto out_unlock;
3491
3492 extent_start = entry->offset;
3493 extent_bytes = entry->bytes;
3494 extent_trim_state = entry->trim_state;
3495 if (async) {
3496 start = entry->offset;
3497 bytes = entry->bytes;
3498 if (bytes < minlen) {
3499 spin_unlock(&ctl->tree_lock);
3500 mutex_unlock(&ctl->cache_writeout_mutex);
3501 goto next;
3502 }
3503 unlink_free_space(ctl, entry);
3504 /*
3505 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3506 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3507 * X when we come back around. So trim it now.
3508 */
3509 if (max_discard_size &&
3510 bytes >= (max_discard_size +
3511 BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3512 bytes = max_discard_size;
3513 extent_bytes = max_discard_size;
3514 entry->offset += max_discard_size;
3515 entry->bytes -= max_discard_size;
3516 link_free_space(ctl, entry);
3517 } else {
3518 kmem_cache_free(btrfs_free_space_cachep, entry);
3519 }
3520 } else {
3521 start = max(start, extent_start);
3522 bytes = min(extent_start + extent_bytes, end) - start;
3523 if (bytes < minlen) {
3524 spin_unlock(&ctl->tree_lock);
3525 mutex_unlock(&ctl->cache_writeout_mutex);
3526 goto next;
3527 }
3528
3529 unlink_free_space(ctl, entry);
3530 kmem_cache_free(btrfs_free_space_cachep, entry);
3531 }
3532
3533 spin_unlock(&ctl->tree_lock);
3534 trim_entry.start = extent_start;
3535 trim_entry.bytes = extent_bytes;
3536 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3537 mutex_unlock(&ctl->cache_writeout_mutex);
3538
3539 ret = do_trimming(block_group, total_trimmed, start, bytes,
3540 extent_start, extent_bytes, extent_trim_state,
3541 &trim_entry);
3542 if (ret) {
3543 block_group->discard_cursor = start + bytes;
3544 break;
3545 }
3546 next:
3547 start += bytes;
3548 block_group->discard_cursor = start;
3549 if (async && *total_trimmed)
3550 break;
3551
3552 if (fatal_signal_pending(current)) {
3553 ret = -ERESTARTSYS;
3554 break;
3555 }
3556
3557 cond_resched();
3558 }
3559
3560 return ret;
3561
3562 out_unlock:
3563 block_group->discard_cursor = btrfs_block_group_end(block_group);
3564 spin_unlock(&ctl->tree_lock);
3565 mutex_unlock(&ctl->cache_writeout_mutex);
3566
3567 return ret;
3568 }
3569
3570 /*
3571 * If we break out of trimming a bitmap prematurely, we should reset the
3572 * trimming bit. In a rather contrived case, it's possible to race here so
3573 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3574 *
3575 * start = start of bitmap
3576 * end = near end of bitmap
3577 *
3578 * Thread 1: Thread 2:
3579 * trim_bitmaps(start)
3580 * trim_bitmaps(end)
3581 * end_trimming_bitmap()
3582 * reset_trimming_bitmap()
3583 */
3584 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3585 {
3586 struct btrfs_free_space *entry;
3587
3588 spin_lock(&ctl->tree_lock);
3589 entry = tree_search_offset(ctl, offset, 1, 0);
3590 if (entry) {
3591 if (btrfs_free_space_trimmed(entry)) {
3592 ctl->discardable_extents[BTRFS_STAT_CURR] +=
3593 entry->bitmap_extents;
3594 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3595 }
3596 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3597 }
3598
3599 spin_unlock(&ctl->tree_lock);
3600 }
3601
3602 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3603 struct btrfs_free_space *entry)
3604 {
3605 if (btrfs_free_space_trimming_bitmap(entry)) {
3606 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3607 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3608 entry->bitmap_extents;
3609 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3610 }
3611 }
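/*
 * Summary of the bitmap trim_state transitions implemented above and in
 * trim_bitmaps()/reset_trimming_bitmap() (descriptive note, no new behavior):
 * a bitmap moves UNTRIMMED -> TRIMMING when a full scan starts at its first
 * offset, TRIMMING -> TRIMMED once the whole bitmap has been walked, and is
 * reset to UNTRIMMED if trimming is interrupted or races as described above.
 */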
3612
3613 /*
3614 * If @async is set, then we will trim 1 region and return.
3615 */
3616 static int trim_bitmaps(struct btrfs_block_group *block_group,
3617 u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3618 u64 maxlen, bool async)
3619 {
3620 struct btrfs_discard_ctl *discard_ctl =
3621 &block_group->fs_info->discard_ctl;
3622 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3623 struct btrfs_free_space *entry;
3624 int ret = 0;
3625 int ret2;
3626 u64 bytes;
3627 u64 offset = offset_to_bitmap(ctl, start);
3628 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3629
3630 while (offset < end) {
3631 bool next_bitmap = false;
3632 struct btrfs_trim_range trim_entry;
3633
3634 mutex_lock(&ctl->cache_writeout_mutex);
3635 spin_lock(&ctl->tree_lock);
3636
3637 if (ctl->free_space < minlen) {
3638 block_group->discard_cursor =
3639 btrfs_block_group_end(block_group);
3640 spin_unlock(&ctl->tree_lock);
3641 mutex_unlock(&ctl->cache_writeout_mutex);
3642 break;
3643 }
3644
3645 entry = tree_search_offset(ctl, offset, 1, 0);
3646 /*
3647 * Bitmaps are marked trimmed lossily now to prevent constant
3648 * discarding of the same bitmap (the reason why we are bound
3649 * by the filters). So, retrim the block group bitmaps when we
3650 * are preparing to punt to the unused_bgs list. This uses
3651 		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
3652 		 * the only discard index that sets @minlen to 0.
3653 */
3654 if (!entry || (async && minlen && start == offset &&
3655 btrfs_free_space_trimmed(entry))) {
3656 spin_unlock(&ctl->tree_lock);
3657 mutex_unlock(&ctl->cache_writeout_mutex);
3658 next_bitmap = true;
3659 goto next;
3660 }
3661
3662 /*
3663 		 * Async discard bitmap trimming begins by setting the start
3664 		 * to key.objectid; offset_to_bitmap() aligns to the
3665 * start of the bitmap. This lets us know we are fully
3666 * scanning the bitmap rather than only some portion of it.
3667 */
3668 if (start == offset)
3669 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3670
3671 bytes = minlen;
3672 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3673 if (ret2 || start >= end) {
3674 /*
3675 * We lossily consider a bitmap trimmed if we only skip
3676 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3677 */
3678 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3679 end_trimming_bitmap(ctl, entry);
3680 else
3681 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3682 spin_unlock(&ctl->tree_lock);
3683 mutex_unlock(&ctl->cache_writeout_mutex);
3684 next_bitmap = true;
3685 goto next;
3686 }
3687
3688 /*
3689 * We already trimmed a region, but are using the locking above
3690 * to reset the trim_state.
3691 */
3692 if (async && *total_trimmed) {
3693 spin_unlock(&ctl->tree_lock);
3694 mutex_unlock(&ctl->cache_writeout_mutex);
3695 goto out;
3696 }
3697
3698 bytes = min(bytes, end - start);
3699 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3700 spin_unlock(&ctl->tree_lock);
3701 mutex_unlock(&ctl->cache_writeout_mutex);
3702 goto next;
3703 }
3704
3705 /*
3706 		 * Let bytes = max_discard_size + X.
3707 * If X < @minlen, we won't trim X when we come back around.
3708 * So trim it now. We differ here from trimming extents as we
3709 * don't keep individual state per bit.
3710 */
3711 if (async &&
3712 max_discard_size &&
3713 bytes > (max_discard_size + minlen))
3714 bytes = max_discard_size;
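		/*
		 * Illustrative numbers (not from the source): with
		 * max_discard_size = 64M and minlen = 256K, a 65M run is
		 * clamped by the check above to 64M and the remaining 1M is
		 * picked up on a later pass; a run of 64M + 128K is trimmed
		 * whole, since the 128K leftover would fall below @minlen and
		 * never be revisited.
		 */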
3715
3716 bitmap_clear_bits(ctl, entry, start, bytes);
3717 if (entry->bytes == 0)
3718 free_bitmap(ctl, entry);
3719
3720 spin_unlock(&ctl->tree_lock);
3721 trim_entry.start = start;
3722 trim_entry.bytes = bytes;
3723 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3724 mutex_unlock(&ctl->cache_writeout_mutex);
3725
3726 ret = do_trimming(block_group, total_trimmed, start, bytes,
3727 start, bytes, 0, &trim_entry);
3728 if (ret) {
3729 reset_trimming_bitmap(ctl, offset);
3730 block_group->discard_cursor =
3731 btrfs_block_group_end(block_group);
3732 break;
3733 }
3734 next:
3735 if (next_bitmap) {
3736 offset += BITS_PER_BITMAP * ctl->unit;
3737 start = offset;
3738 } else {
3739 start += bytes;
3740 }
3741 block_group->discard_cursor = start;
3742
3743 if (fatal_signal_pending(current)) {
3744 if (start != offset)
3745 reset_trimming_bitmap(ctl, offset);
3746 ret = -ERESTARTSYS;
3747 break;
3748 }
3749
3750 cond_resched();
3751 }
3752
3753 if (offset >= end)
3754 block_group->discard_cursor = end;
3755
3756 out:
3757 return ret;
3758 }
3759
3760 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3761 u64 *trimmed, u64 start, u64 end, u64 minlen)
3762 {
3763 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3764 int ret;
3765 u64 rem = 0;
3766
3767 *trimmed = 0;
3768
3769 spin_lock(&block_group->lock);
3770 if (block_group->removed) {
3771 spin_unlock(&block_group->lock);
3772 return 0;
3773 }
3774 btrfs_freeze_block_group(block_group);
3775 spin_unlock(&block_group->lock);
3776
3777 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3778 if (ret)
3779 goto out;
3780
3781 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3782 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3783 /* If we ended in the middle of a bitmap, reset the trimming flag */
3784 if (rem)
3785 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
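	/*
	 * Illustrative arithmetic (assuming 4K pages and a 4K sectorsize):
	 * BITS_PER_BITMAP * ctl->unit = 32768 * 4096 = 128M, so any @end that
	 * is not a multiple of 128M leaves rem != 0 and the bitmap covering
	 * @end is marked untrimmed again.
	 */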
3786 out:
3787 btrfs_unfreeze_block_group(block_group);
3788 return ret;
3789 }
3790
3791 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
3792 u64 *trimmed, u64 start, u64 end, u64 minlen,
3793 bool async)
3794 {
3795 int ret;
3796
3797 *trimmed = 0;
3798
3799 spin_lock(&block_group->lock);
3800 if (block_group->removed) {
3801 spin_unlock(&block_group->lock);
3802 return 0;
3803 }
3804 btrfs_freeze_block_group(block_group);
3805 spin_unlock(&block_group->lock);
3806
3807 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
3808 btrfs_unfreeze_block_group(block_group);
3809
3810 return ret;
3811 }
3812
3813 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
3814 u64 *trimmed, u64 start, u64 end, u64 minlen,
3815 u64 maxlen, bool async)
3816 {
3817 int ret;
3818
3819 *trimmed = 0;
3820
3821 spin_lock(&block_group->lock);
3822 if (block_group->removed) {
3823 spin_unlock(&block_group->lock);
3824 return 0;
3825 }
3826 btrfs_freeze_block_group(block_group);
3827 spin_unlock(&block_group->lock);
3828
3829 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
3830 async);
3831
3832 btrfs_unfreeze_block_group(block_group);
3833
3834 return ret;
3835 }
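/*
 * Note (descriptive, the caller lives outside this file): the async discard
 * machinery is expected to drive the two helpers above in passes, first
 * extents and then bitmaps, resuming each pass from
 * block_group->discard_cursor.
 */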
3836
3837 /*
3838 * Find the left-most item in the cache tree, and then return the
3839 * smallest inode number in the item.
3840 *
3841 * Note: the returned inode number may not be the smallest one in
3842 * the tree, if the left-most item is a bitmap.
3843 */
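/*
 * Example (illustrative values): if the left-most entry is the extent
 * [offset = 256, bytes = 10], ino 256 is returned and the entry shrinks to
 * [257, 9]; if the left-most entry is a bitmap, the lowest set bit is
 * returned and cleared instead.
 */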
3844 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3845 {
3846 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3847 struct btrfs_free_space *entry = NULL;
3848 u64 ino = 0;
3849
3850 spin_lock(&ctl->tree_lock);
3851
3852 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3853 goto out;
3854
3855 entry = rb_entry(rb_first(&ctl->free_space_offset),
3856 struct btrfs_free_space, offset_index);
3857
3858 if (!entry->bitmap) {
3859 ino = entry->offset;
3860
3861 unlink_free_space(ctl, entry);
3862 entry->offset++;
3863 entry->bytes--;
3864 if (!entry->bytes)
3865 kmem_cache_free(btrfs_free_space_cachep, entry);
3866 else
3867 link_free_space(ctl, entry);
3868 } else {
3869 u64 offset = 0;
3870 u64 count = 1;
3871 int ret;
3872
3873 ret = search_bitmap(ctl, entry, &offset, &count, true);
3874 		/* Logic error; should be empty if it can't find anything */
3875 ASSERT(!ret);
3876
3877 ino = offset;
3878 bitmap_clear_bits(ctl, entry, offset, 1);
3879 if (entry->bytes == 0)
3880 free_bitmap(ctl, entry);
3881 }
3882 out:
3883 spin_unlock(&ctl->tree_lock);
3884
3885 return ino;
3886 }
3887
3888 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3889 struct btrfs_path *path)
3890 {
3891 struct inode *inode = NULL;
3892
3893 spin_lock(&root->ino_cache_lock);
3894 if (root->ino_cache_inode)
3895 inode = igrab(root->ino_cache_inode);
3896 spin_unlock(&root->ino_cache_lock);
3897 if (inode)
3898 return inode;
3899
3900 inode = __lookup_free_space_inode(root, path, 0);
3901 if (IS_ERR(inode))
3902 return inode;
3903
3904 spin_lock(&root->ino_cache_lock);
3905 if (!btrfs_fs_closing(root->fs_info))
3906 root->ino_cache_inode = igrab(inode);
3907 spin_unlock(&root->ino_cache_lock);
3908
3909 return inode;
3910 }
3911
3912 int create_free_ino_inode(struct btrfs_root *root,
3913 struct btrfs_trans_handle *trans,
3914 struct btrfs_path *path)
3915 {
3916 return __create_free_space_inode(root, trans, path,
3917 BTRFS_FREE_INO_OBJECTID, 0);
3918 }
3919
3920 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3921 {
3922 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3923 struct btrfs_path *path;
3924 struct inode *inode;
3925 int ret = 0;
3926 u64 root_gen = btrfs_root_generation(&root->root_item);
3927
3928 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3929 return 0;
3930
3931 /*
3932 * If we're unmounting then just return, since this does a search on the
3933 * normal root and not the commit root and we could deadlock.
3934 */
3935 if (btrfs_fs_closing(fs_info))
3936 return 0;
3937
3938 path = btrfs_alloc_path();
3939 if (!path)
3940 return 0;
3941
3942 inode = lookup_free_ino_inode(root, path);
3943 if (IS_ERR(inode))
3944 goto out;
3945
3946 if (root_gen != BTRFS_I(inode)->generation)
3947 goto out_put;
3948
3949 ret = __load_free_space_cache(root, inode, ctl, path, 0);
3950
3951 if (ret < 0)
3952 btrfs_err(fs_info,
3953 "failed to load free ino cache for root %llu",
3954 root->root_key.objectid);
3955 out_put:
3956 iput(inode);
3957 out:
3958 btrfs_free_path(path);
3959 return ret;
3960 }
3961
3962 int btrfs_write_out_ino_cache(struct btrfs_root *root,
3963 struct btrfs_trans_handle *trans,
3964 struct btrfs_path *path,
3965 struct inode *inode)
3966 {
3967 struct btrfs_fs_info *fs_info = root->fs_info;
3968 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3969 int ret;
3970 struct btrfs_io_ctl io_ctl;
3971 bool release_metadata = true;
3972
3973 if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3974 return 0;
3975
3976 memset(&io_ctl, 0, sizeof(io_ctl));
3977 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3978 if (!ret) {
3979 /*
3980 * At this point writepages() didn't error out, so our metadata
3981 * reservation is released when the writeback finishes, at
3982 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
3983 * with or without an error.
3984 */
3985 release_metadata = false;
3986 ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
3987 }
3988
3989 if (ret) {
3990 if (release_metadata)
3991 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3992 inode->i_size, true);
3993 btrfs_debug(fs_info,
3994 "failed to write free ino cache for root %llu error %d",
3995 root->root_key.objectid, ret);
3996 }
3997
3998 return ret;
3999 }
4000
4001 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4002 /*
4003  * Use this if you need to make a bitmap or extent entry specifically. It
4004  * doesn't do any of the merging that add_free_space does; it acts a lot like
4005  * how the free space cache loading works, so you can build really weird
4006  * configurations. See the usage sketch after the function.
4007 */
4008 int test_add_free_space_entry(struct btrfs_block_group *cache,
4009 u64 offset, u64 bytes, bool bitmap)
4010 {
4011 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4012 struct btrfs_free_space *info = NULL, *bitmap_info;
4013 void *map = NULL;
4014 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4015 u64 bytes_added;
4016 int ret;
4017
4018 again:
4019 if (!info) {
4020 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4021 if (!info)
4022 return -ENOMEM;
4023 }
4024
4025 if (!bitmap) {
4026 spin_lock(&ctl->tree_lock);
4027 info->offset = offset;
4028 info->bytes = bytes;
4029 info->max_extent_size = 0;
4030 ret = link_free_space(ctl, info);
4031 spin_unlock(&ctl->tree_lock);
4032 if (ret)
4033 kmem_cache_free(btrfs_free_space_cachep, info);
4034 return ret;
4035 }
4036
4037 if (!map) {
4038 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4039 if (!map) {
4040 kmem_cache_free(btrfs_free_space_cachep, info);
4041 return -ENOMEM;
4042 }
4043 }
4044
4045 spin_lock(&ctl->tree_lock);
4046 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4047 1, 0);
4048 if (!bitmap_info) {
4049 info->bitmap = map;
4050 map = NULL;
4051 add_new_bitmap(ctl, info, offset);
4052 bitmap_info = info;
4053 info = NULL;
4054 }
4055
4056 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4057 trim_state);
4058
4059 bytes -= bytes_added;
4060 offset += bytes_added;
4061 spin_unlock(&ctl->tree_lock);
4062
4063 if (bytes)
4064 goto again;
4065
4066 if (info)
4067 kmem_cache_free(btrfs_free_space_cachep, info);
4068 if (map)
4069 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4070 return 0;
4071 }
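/*
 * Usage sketch (hypothetical values; the real callers are the free space
 * self tests): add a plain extent entry, then force a bitmap entry for a
 * different range.
 *
 *	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
 *	if (!ret)
 *		ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
 */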
4072
4073 /*
4074  * Checks whether the given range is in the free space cache. This is really
4075  * just used to check for the absence of space, so if there is any free space
4076  * in the range at all we will return 1. See the usage sketch after the function.
4077 */
4078 int test_check_exists(struct btrfs_block_group *cache,
4079 u64 offset, u64 bytes)
4080 {
4081 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4082 struct btrfs_free_space *info;
4083 int ret = 0;
4084
4085 spin_lock(&ctl->tree_lock);
4086 info = tree_search_offset(ctl, offset, 0, 0);
4087 if (!info) {
4088 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4089 1, 0);
4090 if (!info)
4091 goto out;
4092 }
4093
4094 have_info:
4095 if (info->bitmap) {
4096 u64 bit_off, bit_bytes;
4097 struct rb_node *n;
4098 struct btrfs_free_space *tmp;
4099
4100 bit_off = offset;
4101 bit_bytes = ctl->unit;
4102 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4103 if (!ret) {
4104 if (bit_off == offset) {
4105 ret = 1;
4106 goto out;
4107 } else if (bit_off > offset &&
4108 offset + bytes > bit_off) {
4109 ret = 1;
4110 goto out;
4111 }
4112 }
4113
4114 n = rb_prev(&info->offset_index);
4115 while (n) {
4116 tmp = rb_entry(n, struct btrfs_free_space,
4117 offset_index);
4118 if (tmp->offset + tmp->bytes < offset)
4119 break;
4120 if (offset + bytes < tmp->offset) {
4121 n = rb_prev(&tmp->offset_index);
4122 continue;
4123 }
4124 info = tmp;
4125 goto have_info;
4126 }
4127
4128 n = rb_next(&info->offset_index);
4129 while (n) {
4130 tmp = rb_entry(n, struct btrfs_free_space,
4131 offset_index);
4132 if (offset + bytes < tmp->offset)
4133 break;
4134 if (tmp->offset + tmp->bytes < offset) {
4135 n = rb_next(&tmp->offset_index);
4136 continue;
4137 }
4138 info = tmp;
4139 goto have_info;
4140 }
4141
4142 ret = 0;
4143 goto out;
4144 }
4145
4146 if (info->offset == offset) {
4147 ret = 1;
4148 goto out;
4149 }
4150
4151 if (offset > info->offset && offset < info->offset + info->bytes)
4152 ret = 1;
4153 out:
4154 spin_unlock(&ctl->tree_lock);
4155 return ret;
4156 }
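/*
 * Usage sketch (hypothetical values): after a test removes
 * [SZ_1M, SZ_1M + SZ_4M) from the cache, test_check_exists(cache, SZ_1M,
 * SZ_4M) should return 0; any remaining overlap in the range returns 1.
 */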
4157 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
4158