Lines Matching +full:page +full:- +full:offset (search hits in mm/swapfile.c; each hit is shown with its line number in that file and its enclosing function)
1 // SPDX-License-Identifier: GPL-2.0-only
30 #include <linux/backing-dev.h>
64 static int least_priority = -1;
68 static const char Bad_offset[] = "Bad swap offset entry ";
69 static const char Unused_offset[] = "Unused swap offset entry ";
84 * swap_info_struct changes between not-full/full, it needs to
85 * add/remove itself to/from this list, but the swap_info_struct->lock
87 * before any swap_info_struct->lock.
120 * corresponding page
128 unsigned long offset, unsigned long flags) in __try_to_reclaim_swap() argument
130 swp_entry_t entry = swp_entry(si->type, offset); in __try_to_reclaim_swap()
131 struct page *page; in __try_to_reclaim_swap() local
134 page = find_get_page(swap_address_space(entry), offset); in __try_to_reclaim_swap()
135 if (!page) in __try_to_reclaim_swap()
139 * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, in __try_to_reclaim_swap()
144 if (trylock_page(page)) { in __try_to_reclaim_swap()
146 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || in __try_to_reclaim_swap()
147 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) in __try_to_reclaim_swap()
148 ret = try_to_free_swap(page); in __try_to_reclaim_swap()
149 unlock_page(page); in __try_to_reclaim_swap()
151 put_page(page); in __try_to_reclaim_swap()
157 struct rb_node *rb = rb_first(&sis->swap_extent_root); in first_se()
163 struct rb_node *rb = rb_next(&se->rb_node); in next_se()
169 * to allow the swap device to optimize its wear-levelling.
178 /* Do not discard the swap header page! */ in discard_swap()
180 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); in discard_swap()
181 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); in discard_swap()
183 err = blkdev_issue_discard(si->bdev, start_block, in discard_swap()
191 start_block = se->start_block << (PAGE_SHIFT - 9); in discard_swap()
192 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); in discard_swap()
194 err = blkdev_issue_discard(si->bdev, start_block, in discard_swap()
201 return err; /* That will often be -EOPNOTSUPP */ in discard_swap()
205 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) in offset_to_swap_extent() argument
210 rb = sis->swap_extent_root.rb_node; in offset_to_swap_extent()
213 if (offset < se->start_page) in offset_to_swap_extent()
214 rb = rb->rb_left; in offset_to_swap_extent()
215 else if (offset >= se->start_page + se->nr_pages) in offset_to_swap_extent()
216 rb = rb->rb_right; in offset_to_swap_extent()
224 sector_t swap_page_sector(struct page *page) in swap_page_sector() argument
226 struct swap_info_struct *sis = page_swap_info(page); in swap_page_sector()
229 pgoff_t offset; in swap_page_sector() local
231 offset = __page_file_index(page); in swap_page_sector()
232 se = offset_to_swap_extent(sis, offset); in swap_page_sector()
233 sector = se->start_block + (offset - se->start_page); in swap_page_sector()
234 return sector << (PAGE_SHIFT - 9); in swap_page_sector()
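The swap_page_sector() hits above show the page-to-sector arithmetic: find the extent covering the swap offset, add the offset within the extent to the extent's starting block, then shift by PAGE_SHIFT - 9 to get 512-byte sectors. A standalone userspace sketch of that arithmetic, assuming 4 KiB pages and a toy_extent struct standing in for the kernel's struct swap_extent:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

/* toy stand-in for the kernel's struct swap_extent */
struct toy_extent {
	uint64_t start_page;	/* first swap page offset covered by this extent */
	uint64_t nr_pages;	/* number of pages in the extent */
	uint64_t start_block;	/* first backing device block, in page-size units */
};

/* same arithmetic as swap_page_sector(): swap page offset -> 512-byte sector */
static uint64_t page_offset_to_sector(const struct toy_extent *se, uint64_t offset)
{
	uint64_t block = se->start_block + (offset - se->start_page);

	return block << (PAGE_SHIFT - 9);	/* 8 sectors per 4 KiB page */
}

int main(void)
{
	struct toy_extent se = { .start_page = 100, .nr_pages = 50, .start_block = 2048 };

	/* swap page 103 lies 3 pages into the extent: block 2051, sector 16408 */
	printf("%llu\n", (unsigned long long)page_offset_to_sector(&se, 103));
	return 0;
}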
239 * to allow the swap device to optimize its wear-levelling.
247 pgoff_t offset = start_page - se->start_page; in discard_swap_cluster() local
248 sector_t start_block = se->start_block + offset; in discard_swap_cluster()
249 sector_t nr_blocks = se->nr_pages - offset; in discard_swap_cluster()
254 nr_pages -= nr_blocks; in discard_swap_cluster()
256 start_block <<= PAGE_SHIFT - 9; in discard_swap_cluster()
257 nr_blocks <<= PAGE_SHIFT - 9; in discard_swap_cluster()
258 if (blkdev_issue_discard(si->bdev, start_block, in discard_swap_cluster()
284 info->flags = flag; in cluster_set_flag()
289 return info->data; in cluster_count()
295 info->data = c; in cluster_set_count()
301 info->flags = f; in cluster_set_count_flag()
302 info->data = c; in cluster_set_count_flag()
307 return info->data; in cluster_next()
313 info->data = n; in cluster_set_next()
319 info->flags = f; in cluster_set_next_flag()
320 info->data = n; in cluster_set_next_flag()
325 return info->flags & CLUSTER_FLAG_FREE; in cluster_is_free()
330 return info->flags & CLUSTER_FLAG_NEXT_NULL; in cluster_is_null()
335 info->flags = CLUSTER_FLAG_NEXT_NULL; in cluster_set_null()
336 info->data = 0; in cluster_set_null()
342 return info->flags & CLUSTER_FLAG_HUGE; in cluster_is_huge()
348 info->flags &= ~CLUSTER_FLAG_HUGE; in cluster_clear_huge()
352 unsigned long offset) in lock_cluster() argument
356 ci = si->cluster_info; in lock_cluster()
358 ci += offset / SWAPFILE_CLUSTER; in lock_cluster()
359 spin_lock(&ci->lock); in lock_cluster()
367 spin_unlock(&ci->lock); in unlock_cluster()
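lock_cluster() above locates the per-cluster record with a plain division: one swap_cluster_info per SWAPFILE_CLUSTER slots, with a count or next-cluster index kept in a small data field and state bits (free, next-null, huge) in a flags field. A minimal sketch of that layout and indexing, with the cluster size and the 24/8 bit split assumed for illustration:

#include <stdio.h>

#define SWAPFILE_CLUSTER	256	/* assumed cluster size, in swap slots */
#define TOY_CLUSTER_FLAG_FREE	1	/* illustrative flag bit */

/* toy per-cluster record mirroring the accessors above: "data" holds either a
 * use count or a next-cluster index, "flags" holds FREE/NEXT_NULL/HUGE bits. */
struct toy_cluster_info {
	unsigned int data : 24;
	unsigned int flags : 8;
};

int main(void)
{
	struct toy_cluster_info clusters[4] = { { .flags = TOY_CLUSTER_FLAG_FREE } };
	unsigned long offset = 700;	/* some swap slot offset */

	/* same indexing as lock_cluster(): one record per SWAPFILE_CLUSTER slots */
	struct toy_cluster_info *ci = &clusters[offset / SWAPFILE_CLUSTER];

	printf("slot %lu -> cluster %lu: count %u, free %d\n",
	       offset, offset / SWAPFILE_CLUSTER,
	       (unsigned int)ci->data, !!(ci->flags & TOY_CLUSTER_FLAG_FREE));
	return 0;
}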
372 * swap_cluster_info if SSD-style cluster-based locking is in place.
375 struct swap_info_struct *si, unsigned long offset) in lock_cluster_or_swap_info() argument
379 /* Try to use fine-grained SSD-style locking if available: */ in lock_cluster_or_swap_info()
380 ci = lock_cluster(si, offset); in lock_cluster_or_swap_info()
383 spin_lock(&si->lock); in lock_cluster_or_swap_info()
394 spin_unlock(&si->lock); in unlock_cluster_or_swap_info()
399 return cluster_is_null(&list->head); in cluster_list_empty()
404 return cluster_next(&list->head); in cluster_list_first()
409 cluster_set_null(&list->head); in cluster_list_init()
410 cluster_set_null(&list->tail); in cluster_list_init()
418 cluster_set_next_flag(&list->head, idx, 0); in cluster_list_add_tail()
419 cluster_set_next_flag(&list->tail, idx, 0); in cluster_list_add_tail()
422 unsigned int tail = cluster_next(&list->tail); in cluster_list_add_tail()
426 * only acquired when we held swap_info_struct->lock in cluster_list_add_tail()
429 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); in cluster_list_add_tail()
431 spin_unlock(&ci_tail->lock); in cluster_list_add_tail()
432 cluster_set_next_flag(&list->tail, idx, 0); in cluster_list_add_tail()
441 idx = cluster_next(&list->head); in cluster_list_del_first()
442 if (cluster_next(&list->tail) == idx) { in cluster_list_del_first()
443 cluster_set_null(&list->head); in cluster_list_del_first()
444 cluster_set_null(&list->tail); in cluster_list_del_first()
446 cluster_set_next_flag(&list->head, in cluster_list_del_first()
458 * si->swap_map directly. To make sure the discarding cluster isn't in swap_cluster_schedule_discard()
462 memset(si->swap_map + idx * SWAPFILE_CLUSTER, in swap_cluster_schedule_discard()
465 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); in swap_cluster_schedule_discard()
467 schedule_work(&si->discard_work); in swap_cluster_schedule_discard()
472 struct swap_cluster_info *ci = si->cluster_info; in __free_cluster()
475 cluster_list_add_tail(&si->free_clusters, ci, idx); in __free_cluster()
480 * will be added to free cluster list. caller should hold si->lock.
487 info = si->cluster_info; in swap_do_scheduled_discard()
489 while (!cluster_list_empty(&si->discard_clusters)) { in swap_do_scheduled_discard()
490 idx = cluster_list_del_first(&si->discard_clusters, info); in swap_do_scheduled_discard()
491 spin_unlock(&si->lock); in swap_do_scheduled_discard()
496 spin_lock(&si->lock); in swap_do_scheduled_discard()
499 memset(si->swap_map + idx * SWAPFILE_CLUSTER, in swap_do_scheduled_discard()
511 spin_lock(&si->lock); in swap_discard_work()
513 spin_unlock(&si->lock); in swap_discard_work()
518 struct swap_cluster_info *ci = si->cluster_info; in alloc_cluster()
520 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); in alloc_cluster()
521 cluster_list_del_first(&si->free_clusters, ci); in alloc_cluster()
527 struct swap_cluster_info *ci = si->cluster_info + idx; in free_cluster()
535 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == in free_cluster()
565 * counter becomes 0, which means no page in the cluster is in using, we can
578 cluster_count(&cluster_info[idx]) - 1); in dec_cluster_info_page()
590 unsigned long offset) in scan_swap_map_ssd_cluster_conflict() argument
595 offset /= SWAPFILE_CLUSTER; in scan_swap_map_ssd_cluster_conflict()
596 conflict = !cluster_list_empty(&si->free_clusters) && in scan_swap_map_ssd_cluster_conflict()
597 offset != cluster_list_first(&si->free_clusters) && in scan_swap_map_ssd_cluster_conflict()
598 cluster_is_free(&si->cluster_info[offset]); in scan_swap_map_ssd_cluster_conflict()
603 percpu_cluster = this_cpu_ptr(si->percpu_cluster); in scan_swap_map_ssd_cluster_conflict()
604 cluster_set_null(&percpu_cluster->index); in scan_swap_map_ssd_cluster_conflict()
613 unsigned long *offset, unsigned long *scan_base) in scan_swap_map_try_ssd_cluster() argument
620 cluster = this_cpu_ptr(si->percpu_cluster); in scan_swap_map_try_ssd_cluster()
621 if (cluster_is_null(&cluster->index)) { in scan_swap_map_try_ssd_cluster()
622 if (!cluster_list_empty(&si->free_clusters)) { in scan_swap_map_try_ssd_cluster()
623 cluster->index = si->free_clusters.head; in scan_swap_map_try_ssd_cluster()
624 cluster->next = cluster_next(&cluster->index) * in scan_swap_map_try_ssd_cluster()
626 } else if (!cluster_list_empty(&si->discard_clusters)) { in scan_swap_map_try_ssd_cluster()
630 * reread cluster_next_cpu since we dropped si->lock in scan_swap_map_try_ssd_cluster()
633 *scan_base = this_cpu_read(*si->cluster_next_cpu); in scan_swap_map_try_ssd_cluster()
634 *offset = *scan_base; in scan_swap_map_try_ssd_cluster()
644 tmp = cluster->next; in scan_swap_map_try_ssd_cluster()
645 max = min_t(unsigned long, si->max, in scan_swap_map_try_ssd_cluster()
646 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); in scan_swap_map_try_ssd_cluster()
650 if (!si->swap_map[tmp]) in scan_swap_map_try_ssd_cluster()
657 cluster_set_null(&cluster->index); in scan_swap_map_try_ssd_cluster()
660 cluster->next = tmp + 1; in scan_swap_map_try_ssd_cluster()
661 *offset = tmp; in scan_swap_map_try_ssd_cluster()
670 assert_spin_locked(&p->lock); in __del_from_avail_list()
672 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); in __del_from_avail_list()
682 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, in swap_range_alloc() argument
685 unsigned int end = offset + nr_entries - 1; in swap_range_alloc()
687 if (offset == si->lowest_bit) in swap_range_alloc()
688 si->lowest_bit += nr_entries; in swap_range_alloc()
689 if (end == si->highest_bit) in swap_range_alloc()
690 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); in swap_range_alloc()
691 si->inuse_pages += nr_entries; in swap_range_alloc()
692 if (si->inuse_pages == si->pages) { in swap_range_alloc()
693 si->lowest_bit = si->max; in swap_range_alloc()
694 si->highest_bit = 0; in swap_range_alloc()
705 WARN_ON(!plist_node_empty(&p->avail_lists[nid])); in add_to_avail_list()
706 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); in add_to_avail_list()
711 static void swap_range_free(struct swap_info_struct *si, unsigned long offset, in swap_range_free() argument
714 unsigned long begin = offset; in swap_range_free()
715 unsigned long end = offset + nr_entries - 1; in swap_range_free()
718 if (offset < si->lowest_bit) in swap_range_free()
719 si->lowest_bit = offset; in swap_range_free()
720 if (end > si->highest_bit) { in swap_range_free()
721 bool was_full = !si->highest_bit; in swap_range_free()
723 WRITE_ONCE(si->highest_bit, end); in swap_range_free()
724 if (was_full && (si->flags & SWP_WRITEOK)) in swap_range_free()
728 si->inuse_pages -= nr_entries; in swap_range_free()
729 if (si->flags & SWP_BLKDEV) in swap_range_free()
731 si->bdev->bd_disk->fops->swap_slot_free_notify; in swap_range_free()
734 while (offset <= end) { in swap_range_free()
735 arch_swap_invalidate_page(si->type, offset); in swap_range_free()
736 frontswap_invalidate_page(si->type, offset); in swap_range_free()
738 swap_slot_free_notify(si->bdev, offset); in swap_range_free()
739 offset++; in swap_range_free()
741 clear_shadow_from_swap_cache(si->type, begin, end); in swap_range_free()
748 if (!(si->flags & SWP_SOLIDSTATE)) { in set_cluster_next()
749 si->cluster_next = next; in set_cluster_next()
753 prev = this_cpu_read(*si->cluster_next_cpu); in set_cluster_next()
762 if (si->highest_bit <= si->lowest_bit) in set_cluster_next()
764 next = si->lowest_bit + in set_cluster_next()
765 prandom_u32_max(si->highest_bit - si->lowest_bit + 1); in set_cluster_next()
767 next = max_t(unsigned int, next, si->lowest_bit); in set_cluster_next()
769 this_cpu_write(*si->cluster_next_cpu, next); in set_cluster_next()
777 unsigned long offset; in scan_swap_map_slots() local
787 * way, however, we resort to first-free allocation, starting in scan_swap_map_slots()
790 * overall disk seek times between swap pages. -- sct in scan_swap_map_slots()
791 * But we do now try to find an empty cluster. -Andrea in scan_swap_map_slots()
795 si->flags += SWP_SCANNING; in scan_swap_map_slots()
801 if (si->flags & SWP_SOLIDSTATE) in scan_swap_map_slots()
802 scan_base = this_cpu_read(*si->cluster_next_cpu); in scan_swap_map_slots()
804 scan_base = si->cluster_next; in scan_swap_map_slots()
805 offset = scan_base; in scan_swap_map_slots()
808 if (si->cluster_info) { in scan_swap_map_slots()
809 if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) in scan_swap_map_slots()
811 } else if (unlikely(!si->cluster_nr--)) { in scan_swap_map_slots()
812 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { in scan_swap_map_slots()
813 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
817 spin_unlock(&si->lock); in scan_swap_map_slots()
822 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info in scan_swap_map_slots()
825 scan_base = offset = si->lowest_bit; in scan_swap_map_slots()
826 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
829 for (; last_in_cluster <= si->highest_bit; offset++) { in scan_swap_map_slots()
830 if (si->swap_map[offset]) in scan_swap_map_slots()
831 last_in_cluster = offset + SWAPFILE_CLUSTER; in scan_swap_map_slots()
832 else if (offset == last_in_cluster) { in scan_swap_map_slots()
833 spin_lock(&si->lock); in scan_swap_map_slots()
834 offset -= SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
835 si->cluster_next = offset; in scan_swap_map_slots()
836 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
839 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
845 offset = scan_base; in scan_swap_map_slots()
846 spin_lock(&si->lock); in scan_swap_map_slots()
847 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
851 if (si->cluster_info) { in scan_swap_map_slots()
852 while (scan_swap_map_ssd_cluster_conflict(si, offset)) { in scan_swap_map_slots()
856 if (!scan_swap_map_try_ssd_cluster(si, &offset, in scan_swap_map_slots()
861 if (!(si->flags & SWP_WRITEOK)) in scan_swap_map_slots()
863 if (!si->highest_bit) in scan_swap_map_slots()
865 if (offset > si->highest_bit) in scan_swap_map_slots()
866 scan_base = offset = si->lowest_bit; in scan_swap_map_slots()
868 ci = lock_cluster(si, offset); in scan_swap_map_slots()
869 /* reuse swap entry of cache-only swap if not busy. */ in scan_swap_map_slots()
870 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { in scan_swap_map_slots()
873 spin_unlock(&si->lock); in scan_swap_map_slots()
874 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); in scan_swap_map_slots()
875 spin_lock(&si->lock); in scan_swap_map_slots()
882 if (si->swap_map[offset]) { in scan_swap_map_slots()
889 WRITE_ONCE(si->swap_map[offset], usage); in scan_swap_map_slots()
890 inc_cluster_info_page(si, si->cluster_info, offset); in scan_swap_map_slots()
893 swap_range_alloc(si, offset, 1); in scan_swap_map_slots()
894 slots[n_ret++] = swp_entry(si->type, offset); in scan_swap_map_slots()
897 if ((n_ret == nr) || (offset >= si->highest_bit)) in scan_swap_map_slots()
903 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
906 spin_unlock(&si->lock); in scan_swap_map_slots()
908 spin_lock(&si->lock); in scan_swap_map_slots()
913 if (si->cluster_info) { in scan_swap_map_slots()
914 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) in scan_swap_map_slots()
916 } else if (si->cluster_nr && !si->swap_map[++offset]) { in scan_swap_map_slots()
917 /* non-ssd case, still more slots in cluster? */ in scan_swap_map_slots()
918 --si->cluster_nr; in scan_swap_map_slots()
930 if (offset < scan_base) in scan_swap_map_slots()
933 scan_limit = si->highest_bit; in scan_swap_map_slots()
934 for (; offset <= scan_limit && --latency_ration > 0; in scan_swap_map_slots()
935 offset++) { in scan_swap_map_slots()
936 if (!si->swap_map[offset]) in scan_swap_map_slots()
942 set_cluster_next(si, offset + 1); in scan_swap_map_slots()
943 si->flags -= SWP_SCANNING; in scan_swap_map_slots()
947 spin_unlock(&si->lock); in scan_swap_map_slots()
948 while (++offset <= READ_ONCE(si->highest_bit)) { in scan_swap_map_slots()
949 if (data_race(!si->swap_map[offset])) { in scan_swap_map_slots()
950 spin_lock(&si->lock); in scan_swap_map_slots()
954 READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { in scan_swap_map_slots()
955 spin_lock(&si->lock); in scan_swap_map_slots()
958 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
964 offset = si->lowest_bit; in scan_swap_map_slots()
965 while (offset < scan_base) { in scan_swap_map_slots()
966 if (data_race(!si->swap_map[offset])) { in scan_swap_map_slots()
967 spin_lock(&si->lock); in scan_swap_map_slots()
971 READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { in scan_swap_map_slots()
972 spin_lock(&si->lock); in scan_swap_map_slots()
975 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
980 offset++; in scan_swap_map_slots()
982 spin_lock(&si->lock); in scan_swap_map_slots()
985 si->flags -= SWP_SCANNING; in scan_swap_map_slots()
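Underneath the cluster caching, latency yields and rescans in scan_swap_map_slots() above sits one basic operation: walk si->swap_map between lowest_bit and highest_bit looking for a zero byte and claim it. A stripped-down sketch of just that core loop; the names and the single-byte usage value are illustrative, and all locking is omitted:

#include <stdio.h>
#include <string.h>

#define NSLOTS 16

/* claim the first free slot in [lowest, highest]; returns its offset or -1 */
static long claim_free_slot(unsigned char *swap_map, long lowest, long highest,
			    unsigned char usage)
{
	for (long offset = lowest; offset <= highest; offset++) {
		if (!swap_map[offset]) {		/* zero byte: slot is unused */
			swap_map[offset] = usage;	/* e.g. one reference */
			return offset;
		}
	}
	return -1;	/* nothing free; the real code falls back to rescanning */
}

int main(void)
{
	unsigned char swap_map[NSLOTS];

	memset(swap_map, 0, sizeof(swap_map));
	swap_map[1] = 1;	/* pretend slot 1 is already in use */

	printf("claimed slot %ld\n", claim_free_slot(swap_map, 1, NSLOTS - 1, 1));
	return 0;
}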
993 unsigned long offset, i; in swap_alloc_cluster() local
998 * page swap is disabled. Warn and fail the allocation. in swap_alloc_cluster()
1005 if (cluster_list_empty(&si->free_clusters)) in swap_alloc_cluster()
1008 idx = cluster_list_first(&si->free_clusters); in swap_alloc_cluster()
1009 offset = idx * SWAPFILE_CLUSTER; in swap_alloc_cluster()
1010 ci = lock_cluster(si, offset); in swap_alloc_cluster()
1014 map = si->swap_map + offset; in swap_alloc_cluster()
1018 swap_range_alloc(si, offset, SWAPFILE_CLUSTER); in swap_alloc_cluster()
1019 *slot = swp_entry(si->type, offset); in swap_alloc_cluster()
1026 unsigned long offset = idx * SWAPFILE_CLUSTER; in swap_free_cluster() local
1029 ci = lock_cluster(si, offset); in swap_free_cluster()
1030 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); in swap_free_cluster()
1034 swap_range_free(si, offset, SWAPFILE_CLUSTER); in swap_free_cluster()
1078 /* requeue si to after same-priority siblings */ in get_swap_pages()
1079 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); in get_swap_pages()
1081 spin_lock(&si->lock); in get_swap_pages()
1082 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { in get_swap_pages()
1084 if (plist_node_empty(&si->avail_lists[node])) { in get_swap_pages()
1085 spin_unlock(&si->lock); in get_swap_pages()
1088 WARN(!si->highest_bit, in get_swap_pages()
1090 si->type); in get_swap_pages()
1091 WARN(!(si->flags & SWP_WRITEOK), in get_swap_pages()
1093 si->type); in get_swap_pages()
1095 spin_unlock(&si->lock); in get_swap_pages()
1099 if (si->flags & SWP_BLKDEV) in get_swap_pages()
1104 spin_unlock(&si->lock); in get_swap_pages()
1107 pr_debug("scan_swap_map of si %d failed to find offset\n", in get_swap_pages()
1108 si->type); in get_swap_pages()
1115 * and since scan_swap_map() can drop the si->lock, multiple in get_swap_pages()
1116 * callers probably all tried to get a page from the same si in get_swap_pages()
1118 * up between us dropping swap_avail_lock and taking si->lock. in get_swap_pages()
1124 if (plist_node_empty(&next->avail_lists[node])) in get_swap_pages()
1132 atomic_long_add((long)(n_goal - n_ret) * size, in get_swap_pages()
1142 pgoff_t offset; in get_swap_page_of_type() local
1147 spin_lock(&si->lock); in get_swap_page_of_type()
1148 if (si->flags & SWP_WRITEOK) { in get_swap_page_of_type()
1150 offset = scan_swap_map(si, 1); in get_swap_page_of_type()
1151 if (offset) { in get_swap_page_of_type()
1153 spin_unlock(&si->lock); in get_swap_page_of_type()
1154 return swp_entry(type, offset); in get_swap_page_of_type()
1157 spin_unlock(&si->lock); in get_swap_page_of_type()
1165 unsigned long offset; in __swap_info_get() local
1172 if (data_race(!(p->flags & SWP_USED))) in __swap_info_get()
1174 offset = swp_offset(entry); in __swap_info_get()
1175 if (offset >= p->max) in __swap_info_get()
1198 if (data_race(!p->swap_map[swp_offset(entry)])) in _swap_info_get()
1214 spin_lock(&p->lock); in swap_info_get()
1227 spin_unlock(&q->lock); in swap_info_get_cont()
1229 spin_lock(&p->lock); in swap_info_get_cont()
1235 unsigned long offset, in __swap_entry_free_locked() argument
1241 count = p->swap_map[offset]; in __swap_entry_free_locked()
1257 if (swap_count_continued(p, offset, count)) in __swap_entry_free_locked()
1262 count--; in __swap_entry_free_locked()
1267 WRITE_ONCE(p->swap_map[offset], usage); in __swap_entry_free_locked()
1269 WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); in __swap_entry_free_locked()
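__swap_entry_free_locked() above manipulates a single swap_map byte that carries both the map count and a "slot is in swap cache" flag, dropping whichever reference the caller passes in. A simplified sketch of that packing, assuming the flag bit value and ignoring SWAP_MAP_SHMEM and count continuation:

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40	/* assumed flag bit: slot also held by swap cache */

/* drop one reference (a map reference, or the swap-cache reference) from a
 * packed swap_map byte and return the new value; 0 means the slot is free. */
static unsigned char drop_ref(unsigned char entry, unsigned char usage)
{
	unsigned char has_cache = entry & SWAP_HAS_CACHE;
	unsigned char count = entry & ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE)
		has_cache = 0;		/* the swap cache lets go of the slot */
	else if (count)
		count--;		/* one page-table mapping lets go */

	return count | has_cache;
}

int main(void)
{
	unsigned char e = 2 | SWAP_HAS_CACHE;	/* two map refs, plus cached */

	e = drop_ref(e, 1);			/* -> 1 map ref, still cached */
	e = drop_ref(e, SWAP_HAS_CACHE);	/* -> 1 map ref, not cached   */
	e = drop_ref(e, 1);			/* -> 0: slot can be freed    */
	printf("%#x\n", e);
	return 0;
}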
1287 * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1288 * true, the si->map, si->cluster_info, etc. must be valid in the
1294 * swapoff, such as page lock, page table lock, etc. The caller must
1310 * the page is read from the swap device, the PTE is verified not
1311 * changed with the page table locked to check whether the swap device
1317 unsigned long offset; in get_swap_device() local
1326 if (data_race(!(si->flags & SWP_VALID))) in get_swap_device()
1328 offset = swp_offset(entry); in get_swap_device()
1329 if (offset >= si->max) in get_swap_device()
1346 unsigned long offset = swp_offset(entry); in __swap_entry_free() local
1349 ci = lock_cluster_or_swap_info(p, offset); in __swap_entry_free()
1350 usage = __swap_entry_free_locked(p, offset, 1); in __swap_entry_free()
1361 unsigned long offset = swp_offset(entry); in swap_entry_free() local
1364 ci = lock_cluster(p, offset); in swap_entry_free()
1365 count = p->swap_map[offset]; in swap_entry_free()
1367 p->swap_map[offset] = 0; in swap_entry_free()
1368 dec_cluster_info_page(p, p->cluster_info, offset); in swap_entry_free()
1372 swap_range_free(p, offset, 1); in swap_entry_free()
1391 void put_swap_page(struct page *page, swp_entry_t entry) in put_swap_page() argument
1393 unsigned long offset = swp_offset(entry); in put_swap_page() local
1394 unsigned long idx = offset / SWAPFILE_CLUSTER; in put_swap_page()
1400 int size = swap_entry_size(thp_nr_pages(page)); in put_swap_page()
1406 ci = lock_cluster_or_swap_info(si, offset); in put_swap_page()
1409 map = si->swap_map + offset; in put_swap_page()
1419 spin_lock(&si->lock); in put_swap_page()
1422 spin_unlock(&si->lock); in put_swap_page()
1427 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { in put_swap_page()
1430 if (i == size - 1) in put_swap_page()
1432 lock_cluster_or_swap_info(si, offset); in put_swap_page()
1443 unsigned long offset = swp_offset(entry); in split_swap_cluster() local
1447 return -EBUSY; in split_swap_cluster()
1448 ci = lock_cluster(si, offset); in split_swap_cluster()
1459 return (int)swp_type(*e1) - (int)swp_type(*e2); in swp_entry_cmp()
1487 spin_unlock(&p->lock); in swapcache_free_entries()
1491 * How many references to page are currently swapped out?
1495 int page_swapcount(struct page *page) in page_swapcount() argument
1501 unsigned long offset; in page_swapcount() local
1503 entry.val = page_private(page); in page_swapcount()
1506 offset = swp_offset(entry); in page_swapcount()
1507 ci = lock_cluster_or_swap_info(p, offset); in page_swapcount()
1508 count = swap_count(p->swap_map[offset]); in page_swapcount()
1517 pgoff_t offset = swp_offset(entry); in __swap_count() local
1522 count = swap_count(si->swap_map[offset]); in __swap_count()
1531 pgoff_t offset = swp_offset(entry); in swap_swapcount() local
1534 ci = lock_cluster_or_swap_info(si, offset); in swap_swapcount()
1535 count = swap_count(si->swap_map[offset]); in swap_swapcount()
1567 struct page *page; in swp_swapcount() local
1568 pgoff_t offset; in swp_swapcount() local
1575 offset = swp_offset(entry); in swp_swapcount()
1577 ci = lock_cluster_or_swap_info(p, offset); in swp_swapcount()
1579 count = swap_count(p->swap_map[offset]); in swp_swapcount()
1586 page = vmalloc_to_page(p->swap_map + offset); in swp_swapcount()
1587 offset &= ~PAGE_MASK; in swp_swapcount()
1588 VM_BUG_ON(page_private(page) != SWP_CONTINUED); in swp_swapcount()
1591 page = list_next_entry(page, lru); in swp_swapcount()
1592 map = kmap_atomic(page); in swp_swapcount()
1593 tmp_count = map[offset]; in swp_swapcount()
1608 unsigned char *map = si->swap_map; in swap_page_trans_huge_swapped()
1610 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); in swap_page_trans_huge_swapped() local
1614 ci = lock_cluster_or_swap_info(si, offset); in swap_page_trans_huge_swapped()
1621 if (swap_count(map[offset + i])) { in swap_page_trans_huge_swapped()
1631 static bool page_swapped(struct page *page) in page_swapped() argument
1636 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) in page_swapped()
1637 return page_swapcount(page) != 0; in page_swapped()
1639 page = compound_head(page); in page_swapped()
1640 entry.val = page_private(page); in page_swapped()
1647 static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, in page_trans_huge_map_swapcount() argument
1651 unsigned long offset = 0; in page_trans_huge_map_swapcount() local
1658 VM_BUG_ON_PAGE(PageHuge(page), page); in page_trans_huge_map_swapcount()
1660 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { in page_trans_huge_map_swapcount()
1661 mapcount = page_trans_huge_mapcount(page, total_mapcount); in page_trans_huge_map_swapcount()
1662 if (PageSwapCache(page)) in page_trans_huge_map_swapcount()
1663 swapcount = page_swapcount(page); in page_trans_huge_map_swapcount()
1669 page = compound_head(page); in page_trans_huge_map_swapcount()
1672 if (PageSwapCache(page)) { in page_trans_huge_map_swapcount()
1675 entry.val = page_private(page); in page_trans_huge_map_swapcount()
1678 map = si->swap_map; in page_trans_huge_map_swapcount()
1679 offset = swp_offset(entry); in page_trans_huge_map_swapcount()
1683 ci = lock_cluster(si, offset); in page_trans_huge_map_swapcount()
1685 mapcount = atomic_read(&page[i]._mapcount) + 1; in page_trans_huge_map_swapcount()
1688 swapcount = swap_count(map[offset + i]); in page_trans_huge_map_swapcount()
1694 if (PageDoubleMap(page)) { in page_trans_huge_map_swapcount()
1695 map_swapcount -= 1; in page_trans_huge_map_swapcount()
1696 _total_mapcount -= HPAGE_PMD_NR; in page_trans_huge_map_swapcount()
1698 mapcount = compound_mapcount(page); in page_trans_huge_map_swapcount()
1710 * We can write to an anon page without COW if there are no other references
1711 * to it. And as a side-effect, free up its swap: because the old content
1719 bool reuse_swap_page(struct page *page, int *total_map_swapcount) in reuse_swap_page() argument
1723 VM_BUG_ON_PAGE(!PageLocked(page), page); in reuse_swap_page()
1724 if (unlikely(PageKsm(page))) in reuse_swap_page()
1726 count = page_trans_huge_map_swapcount(page, &total_mapcount, in reuse_swap_page()
1730 if (count == 1 && PageSwapCache(page) && in reuse_swap_page()
1731 (likely(!PageTransCompound(page)) || in reuse_swap_page()
1733 total_swapcount == page_swapcount(page))) { in reuse_swap_page()
1734 if (!PageWriteback(page)) { in reuse_swap_page()
1735 page = compound_head(page); in reuse_swap_page()
1736 delete_from_swap_cache(page); in reuse_swap_page()
1737 SetPageDirty(page); in reuse_swap_page()
1742 entry.val = page_private(page); in reuse_swap_page()
1744 if (p->flags & SWP_STABLE_WRITES) { in reuse_swap_page()
1745 spin_unlock(&p->lock); in reuse_swap_page()
1748 spin_unlock(&p->lock); in reuse_swap_page()
1756 * If swap is getting full, or if there are no more mappings of this page,
1759 int try_to_free_swap(struct page *page) in try_to_free_swap() argument
1761 VM_BUG_ON_PAGE(!PageLocked(page), page); in try_to_free_swap()
1763 if (!PageSwapCache(page)) in try_to_free_swap()
1765 if (PageWriteback(page)) in try_to_free_swap()
1767 if (page_swapped(page)) in try_to_free_swap()
1773 * - most probably a call from __try_to_reclaim_swap() while in try_to_free_swap()
1775 * but conceivably even a call from memory reclaim - will free in try_to_free_swap()
1776 * the swap from a page which has already been recorded in the in try_to_free_swap()
1777 * image as a clean swapcache page, and then reuse its swap for in try_to_free_swap()
1778 * another page of the image. On waking from hibernation, the in try_to_free_swap()
1779 * original page might be freed under memory pressure, then in try_to_free_swap()
1788 page = compound_head(page); in try_to_free_swap()
1789 delete_from_swap_cache(page); in try_to_free_swap()
1790 SetPageDirty(page); in try_to_free_swap()
1796 * free the page cache entry if it is the last user.
1808 if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { in free_swap_and_cache()
1827 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1832 int swap_type_of(dev_t device, sector_t offset) in swap_type_of() argument
1837 return -1; in swap_type_of()
1843 if (!(sis->flags & SWP_WRITEOK)) in swap_type_of()
1846 if (device == sis->bdev->bd_dev) { in swap_type_of()
1849 if (se->start_block == offset) { in swap_type_of()
1856 return -ENODEV; in swap_type_of()
1867 if (!(sis->flags & SWP_WRITEOK)) in find_first_swap()
1869 *device = sis->bdev->bd_dev; in find_first_swap()
1874 return -ENODEV; in find_first_swap()
1878 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
1881 sector_t swapdev_block(int type, pgoff_t offset) in swapdev_block() argument
1886 if (!si || !(si->flags & SWP_WRITEOK)) in swapdev_block()
1888 return map_swap_entry(swp_entry(type, offset), &bdev); in swapdev_block()
1905 spin_lock(&sis->lock); in count_swap_pages()
1906 if (sis->flags & SWP_WRITEOK) { in count_swap_pages()
1907 n = sis->pages; in count_swap_pages()
1909 n -= sis->inuse_pages; in count_swap_pages()
1911 spin_unlock(&sis->lock); in count_swap_pages()
1925 * just let do_wp_page work it out if a write is requested later - to
1929 unsigned long addr, swp_entry_t entry, struct page *page) in unuse_pte() argument
1931 struct page *swapcache; in unuse_pte()
1936 swapcache = page; in unuse_pte()
1937 page = ksm_might_need_to_copy(page, vma, addr); in unuse_pte()
1938 if (unlikely(!page)) in unuse_pte()
1939 return -ENOMEM; in unuse_pte()
1941 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); in unuse_pte()
1947 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); in unuse_pte()
1948 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); in unuse_pte()
1949 get_page(page); in unuse_pte()
1950 set_pte_at(vma->vm_mm, addr, pte, in unuse_pte()
1951 pte_mkold(mk_pte(page, vma->vm_page_prot))); in unuse_pte()
1952 if (page == swapcache) { in unuse_pte()
1953 page_add_anon_rmap(page, vma, addr, false); in unuse_pte()
1955 page_add_new_anon_rmap(page, vma, addr, false); in unuse_pte()
1956 lru_cache_add_inactive_or_unevictable(page, vma); in unuse_pte()
1961 if (page != swapcache) { in unuse_pte()
1962 unlock_page(page); in unuse_pte()
1963 put_page(page); in unuse_pte()
1973 struct page *page; in unuse_pte_range() local
1977 unsigned long offset; in unuse_pte_range() local
1993 offset = swp_offset(entry); in unuse_pte_range()
1994 if (frontswap && !frontswap_test(si, offset)) in unuse_pte_range()
1998 swap_map = &si->swap_map[offset]; in unuse_pte_range()
1999 page = lookup_swap_cache(entry, vma, addr); in unuse_pte_range()
2000 if (!page) { in unuse_pte_range()
2004 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, in unuse_pte_range()
2007 if (!page) { in unuse_pte_range()
2010 return -ENOMEM; in unuse_pte_range()
2013 lock_page(page); in unuse_pte_range()
2014 wait_on_page_writeback(page); in unuse_pte_range()
2015 ret = unuse_pte(vma, pmd, addr, entry, page); in unuse_pte_range()
2017 unlock_page(page); in unuse_pte_range()
2018 put_page(page); in unuse_pte_range()
2022 try_to_free_swap(page); in unuse_pte_range()
2023 unlock_page(page); in unuse_pte_range()
2024 put_page(page); in unuse_pte_range()
2026 if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { in unuse_pte_range()
2033 pte_unmap(pte - 1); in unuse_pte_range()
2114 addr = vma->vm_start; in unuse_vma()
2115 end = vma->vm_end; in unuse_vma()
2117 pgd = pgd_offset(vma->vm_mm, addr); in unuse_vma()
2137 for (vma = mm->mmap; vma; vma = vma->vm_next) { in unuse_mm()
2138 if (vma->anon_vma) { in unuse_mm()
2167 for (i = prev + 1; i < si->max; i++) { in find_next_to_unuse()
2168 count = READ_ONCE(si->swap_map[i]); in find_next_to_unuse()
2176 if (i == si->max) in find_next_to_unuse()
2194 struct page *page; in try_to_unuse() local
2198 if (!READ_ONCE(si->inuse_pages)) in try_to_unuse()
2214 while (READ_ONCE(si->inuse_pages) && in try_to_unuse()
2216 (p = p->next) != &init_mm.mmlist) { in try_to_unuse()
2243 while (READ_ONCE(si->inuse_pages) && in try_to_unuse()
2248 page = find_get_page(swap_address_space(entry), i); in try_to_unuse()
2249 if (!page) in try_to_unuse()
2253 * It is conceivable that a racing task removed this page from in try_to_unuse()
2254 * swap cache just before we acquired the page lock. The page in try_to_unuse()
2258 lock_page(page); in try_to_unuse()
2259 wait_on_page_writeback(page); in try_to_unuse()
2260 try_to_free_swap(page); in try_to_unuse()
2261 unlock_page(page); in try_to_unuse()
2262 put_page(page); in try_to_unuse()
2269 if (pages_to_unuse && --pages_to_unuse == 0) in try_to_unuse()
2283 * It's easy and robust (though cpu-intensive) just to keep retrying. in try_to_unuse()
2285 if (READ_ONCE(si->inuse_pages)) { in try_to_unuse()
2288 retval = -EINTR; in try_to_unuse()
2298 * added to the mmlist just after page_duplicate - before would be racy.
2306 if (swap_info[type]->inuse_pages) in drain_mmlist()
2316 * corresponds to page offset for the specified swap entry.
2317 * Note that the type of this function is sector_t, but it returns page offset
2318 * into the bdev, not sector offset.
2324 pgoff_t offset; in map_swap_entry() local
2327 *bdev = sis->bdev; in map_swap_entry()
2329 offset = swp_offset(entry); in map_swap_entry()
2330 se = offset_to_swap_extent(sis, offset); in map_swap_entry()
2331 return se->start_block + (offset - se->start_page); in map_swap_entry()
2335 * Returns the page offset into bdev for the specified page's swap entry.
2337 sector_t map_swap_page(struct page *page, struct block_device **bdev) in map_swap_page() argument
2340 entry.val = page_private(page); in map_swap_page()
2349 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { in destroy_swap_extents()
2350 struct rb_node *rb = sis->swap_extent_root.rb_node; in destroy_swap_extents()
2353 rb_erase(rb, &sis->swap_extent_root); in destroy_swap_extents()
2357 if (sis->flags & SWP_ACTIVATED) { in destroy_swap_extents()
2358 struct file *swap_file = sis->swap_file; in destroy_swap_extents()
2359 struct address_space *mapping = swap_file->f_mapping; in destroy_swap_extents()
2361 sis->flags &= ~SWP_ACTIVATED; in destroy_swap_extents()
2362 if (mapping->a_ops->swap_deactivate) in destroy_swap_extents()
2363 mapping->a_ops->swap_deactivate(swap_file); in destroy_swap_extents()
2368 * Add a block range (and the corresponding page range) into this swapdev's
2371 * This function rather assumes that it is called in ascending page order.
2377 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; in add_swap_extent()
2383 * function is called in ascending page order. in add_swap_extent()
2387 link = &parent->rb_right; in add_swap_extent()
2392 BUG_ON(se->start_page + se->nr_pages != start_page); in add_swap_extent()
2393 if (se->start_block + se->nr_pages == start_block) { in add_swap_extent()
2395 se->nr_pages += nr_pages; in add_swap_extent()
2403 return -ENOMEM; in add_swap_extent()
2404 new_se->start_page = start_page; in add_swap_extent()
2405 new_se->nr_pages = nr_pages; in add_swap_extent()
2406 new_se->start_block = start_block; in add_swap_extent()
2408 rb_link_node(&new_se->rb_node, parent, link); in add_swap_extent()
2409 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); in add_swap_extent()
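add_swap_extent() above appends a run of swap pages backed by contiguous device blocks, merging it into the rightmost existing extent when the new blocks continue exactly where the previous extent ended. A flat-array sketch of that coalescing rule (the kernel keeps its extents in an rb-tree instead):

#include <stdio.h>

/* toy extent: a run of swap pages mapped to contiguous device blocks */
struct toy_extent {
	unsigned long start_page, nr_pages, start_block;
};

/* append [start_page, start_page + nr_pages) backed by start_block...,
 * merging with the previous extent when the blocks are contiguous. */
static int append_extent(struct toy_extent *tab, int n,
			 unsigned long start_page, unsigned long nr_pages,
			 unsigned long start_block)
{
	if (n && tab[n - 1].start_page + tab[n - 1].nr_pages == start_page &&
	    tab[n - 1].start_block + tab[n - 1].nr_pages == start_block) {
		tab[n - 1].nr_pages += nr_pages;	/* coalesce */
		return n;
	}
	tab[n] = (struct toy_extent){ start_page, nr_pages, start_block };
	return n + 1;
}

int main(void)
{
	struct toy_extent tab[4];
	int n = 0;

	n = append_extent(tab, n, 0, 100, 1000);
	n = append_extent(tab, n, 100, 50, 1100);	/* contiguous: merges */
	n = append_extent(tab, n, 150, 10, 9000);	/* gap on disk: new extent */
	printf("%d extents, first covers %lu pages\n", n, tab[0].nr_pages);
	return 0;
}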
2418 * time for locating where on disk a page belongs.
2431 * requirements, they are simply tossed out - we will never use those blocks
2438 * Typically it is in the 1-4 megabyte range. So we can have hundreds of
2442 * map_swap_page() has been measured at about 0.3 per page. - akpm.
2446 struct file *swap_file = sis->swap_file; in setup_swap_extents()
2447 struct address_space *mapping = swap_file->f_mapping; in setup_swap_extents()
2448 struct inode *inode = mapping->host; in setup_swap_extents()
2451 if (S_ISBLK(inode->i_mode)) { in setup_swap_extents()
2452 ret = add_swap_extent(sis, 0, sis->max, 0); in setup_swap_extents()
2453 *span = sis->pages; in setup_swap_extents()
2457 if (mapping->a_ops->swap_activate) { in setup_swap_extents()
2458 ret = mapping->a_ops->swap_activate(sis, swap_file, span); in setup_swap_extents()
2460 sis->flags |= SWP_ACTIVATED; in setup_swap_extents()
2462 sis->flags |= SWP_FS_OPS; in setup_swap_extents()
2463 ret = add_swap_extent(sis, 0, sis->max, 0); in setup_swap_extents()
2464 *span = sis->pages; in setup_swap_extents()
2476 if (p->bdev) in swap_node()
2477 bdev = p->bdev; in swap_node()
2479 bdev = p->swap_file->f_inode->i_sb->s_bdev; in swap_node()
2481 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; in swap_node()
2491 p->prio = prio; in setup_swap_info()
2493 p->prio = --least_priority; in setup_swap_info()
2496 * low-to-high, while swap ordering is high-to-low in setup_swap_info()
2498 p->list.prio = -p->prio; in setup_swap_info()
2500 if (p->prio >= 0) in setup_swap_info()
2501 p->avail_lists[i].prio = -p->prio; in setup_swap_info()
2504 p->avail_lists[i].prio = 1; in setup_swap_info()
2506 p->avail_lists[i].prio = -p->prio; in setup_swap_info()
2509 p->swap_map = swap_map; in setup_swap_info()
2510 p->cluster_info = cluster_info; in setup_swap_info()
2515 p->flags |= SWP_WRITEOK | SWP_VALID; in _enable_swap_info()
2516 atomic_long_add(p->pages, &nr_swap_pages); in _enable_swap_info()
2517 total_swap_pages += p->pages; in _enable_swap_info()
2523 * which on removal of any swap_info_struct with an auto-assigned in _enable_swap_info()
2524 * (i.e. negative) priority increments the auto-assigned priority in _enable_swap_info()
2525 * of any lower-priority swap_info_structs. in _enable_swap_info()
2530 plist_add(&p->list, &swap_active_head); in _enable_swap_info()
2539 frontswap_init(p->type, frontswap_map); in enable_swap_info()
2541 spin_lock(&p->lock); in enable_swap_info()
2543 spin_unlock(&p->lock); in enable_swap_info()
2551 spin_lock(&p->lock); in enable_swap_info()
2553 spin_unlock(&p->lock); in enable_swap_info()
2560 spin_lock(&p->lock); in reinsert_swap_info()
2561 setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); in reinsert_swap_info()
2563 spin_unlock(&p->lock); in reinsert_swap_info()
2592 return -EPERM; in SYSCALL_DEFINE1()
2594 BUG_ON(!current->mm); in SYSCALL_DEFINE1()
2605 mapping = victim->f_mapping; in SYSCALL_DEFINE1()
2608 if (p->flags & SWP_WRITEOK) { in SYSCALL_DEFINE1()
2609 if (p->swap_file->f_mapping == mapping) { in SYSCALL_DEFINE1()
2616 err = -EINVAL; in SYSCALL_DEFINE1()
2620 if (!security_vm_enough_memory_mm(current->mm, p->pages)) in SYSCALL_DEFINE1()
2621 vm_unacct_memory(p->pages); in SYSCALL_DEFINE1()
2623 err = -ENOMEM; in SYSCALL_DEFINE1()
2627 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2629 if (p->prio < 0) { in SYSCALL_DEFINE1()
2634 si->prio++; in SYSCALL_DEFINE1()
2635 si->list.prio--; in SYSCALL_DEFINE1()
2637 if (si->avail_lists[nid].prio != 1) in SYSCALL_DEFINE1()
2638 si->avail_lists[nid].prio--; in SYSCALL_DEFINE1()
2643 plist_del(&p->list, &swap_active_head); in SYSCALL_DEFINE1()
2644 atomic_long_sub(p->pages, &nr_swap_pages); in SYSCALL_DEFINE1()
2645 total_swap_pages -= p->pages; in SYSCALL_DEFINE1()
2646 p->flags &= ~SWP_WRITEOK; in SYSCALL_DEFINE1()
2647 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2653 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ in SYSCALL_DEFINE1()
2657 /* re-insert swap space back into swap_list */ in SYSCALL_DEFINE1()
2666 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2667 p->flags &= ~SWP_VALID; /* mark swap device as invalid */ in SYSCALL_DEFINE1()
2668 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2676 flush_work(&p->discard_work); in SYSCALL_DEFINE1()
2679 if (p->flags & SWP_CONTINUED) in SYSCALL_DEFINE1()
2682 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) in SYSCALL_DEFINE1()
2687 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2691 p->highest_bit = 0; /* cuts scans short */ in SYSCALL_DEFINE1()
2692 while (p->flags >= SWP_SCANNING) { in SYSCALL_DEFINE1()
2693 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2697 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2700 swap_file = p->swap_file; in SYSCALL_DEFINE1()
2701 old_block_size = p->old_block_size; in SYSCALL_DEFINE1()
2702 p->swap_file = NULL; in SYSCALL_DEFINE1()
2703 p->max = 0; in SYSCALL_DEFINE1()
2704 swap_map = p->swap_map; in SYSCALL_DEFINE1()
2705 p->swap_map = NULL; in SYSCALL_DEFINE1()
2706 cluster_info = p->cluster_info; in SYSCALL_DEFINE1()
2707 p->cluster_info = NULL; in SYSCALL_DEFINE1()
2709 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2711 arch_swap_invalidate_area(p->type); in SYSCALL_DEFINE1()
2712 frontswap_invalidate_area(p->type); in SYSCALL_DEFINE1()
2715 free_percpu(p->percpu_cluster); in SYSCALL_DEFINE1()
2716 p->percpu_cluster = NULL; in SYSCALL_DEFINE1()
2717 free_percpu(p->cluster_next_cpu); in SYSCALL_DEFINE1()
2718 p->cluster_next_cpu = NULL; in SYSCALL_DEFINE1()
2723 swap_cgroup_swapoff(p->type); in SYSCALL_DEFINE1()
2724 exit_swap_address_space(p->type); in SYSCALL_DEFINE1()
2726 inode = mapping->host; in SYSCALL_DEFINE1()
2727 if (S_ISBLK(inode->i_mode)) { in SYSCALL_DEFINE1()
2735 inode->i_flags &= ~S_SWAPFILE; in SYSCALL_DEFINE1()
2742 * not hold p->lock after we cleared its SWP_WRITEOK. in SYSCALL_DEFINE1()
2745 p->flags = 0; in SYSCALL_DEFINE1()
2762 struct seq_file *seq = file->private_data; in swaps_poll()
2766 if (seq->poll_event != atomic_read(&proc_poll_event)) { in swaps_poll()
2767 seq->poll_event = atomic_read(&proc_poll_event); in swaps_poll()
2787 if (!(si->flags & SWP_USED) || !si->swap_map) in swap_start()
2789 if (!--l) in swap_start()
2804 type = si->type + 1; in swap_next()
2808 if (!(si->flags & SWP_USED) || !si->swap_map) in swap_next()
2833 bytes = si->pages << (PAGE_SHIFT - 10); in swap_show()
2834 inuse = si->inuse_pages << (PAGE_SHIFT - 10); in swap_show()
2836 file = si->swap_file; in swap_show()
2839 len < 40 ? 40 - len : 1, " ", in swap_show()
2840 S_ISBLK(file_inode(file)->i_mode) ? in swap_show()
2844 si->prio); in swap_show()
2864 seq = file->private_data; in swaps_open()
2865 seq->poll_event = atomic_read(&proc_poll_event); in swaps_open()
2904 return ERR_PTR(-ENOMEM); in alloc_swap_info()
2908 if (!(swap_info[type]->flags & SWP_USED)) in alloc_swap_info()
2914 return ERR_PTR(-EPERM); in alloc_swap_info()
2917 p->type = type; in alloc_swap_info()
2931 * would be relying on p->type to remain valid. in alloc_swap_info()
2934 p->swap_extent_root = RB_ROOT; in alloc_swap_info()
2935 plist_node_init(&p->list, 0); in alloc_swap_info()
2937 plist_node_init(&p->avail_lists[i], 0); in alloc_swap_info()
2938 p->flags = SWP_USED; in alloc_swap_info()
2941 spin_lock_init(&p->lock); in alloc_swap_info()
2942 spin_lock_init(&p->cont_lock); in alloc_swap_info()
2951 if (S_ISBLK(inode->i_mode)) { in claim_swapfile()
2952 p->bdev = blkdev_get_by_dev(inode->i_rdev, in claim_swapfile()
2954 if (IS_ERR(p->bdev)) { in claim_swapfile()
2955 error = PTR_ERR(p->bdev); in claim_swapfile()
2956 p->bdev = NULL; in claim_swapfile()
2959 p->old_block_size = block_size(p->bdev); in claim_swapfile()
2960 error = set_blocksize(p->bdev, PAGE_SIZE); in claim_swapfile()
2968 if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) in claim_swapfile()
2969 return -EINVAL; in claim_swapfile()
2970 p->flags |= SWP_BLKDEV; in claim_swapfile()
2971 } else if (S_ISREG(inode->i_mode)) { in claim_swapfile()
2972 p->bdev = inode->i_sb->s_bdev; in claim_swapfile()
2982 * 1) the number of bits for the swap offset in the swp_entry_t type, and
2987 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
2988 * decoded to a swp_entry_t again, and finally the swap offset is
3016 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { in read_swap_header()
3017 pr_err("Unable to find swap-space signature\n"); in read_swap_header()
3022 if (swab32(swap_header->info.version) == 1) { in read_swap_header()
3023 swab32s(&swap_header->info.version); in read_swap_header()
3024 swab32s(&swap_header->info.last_page); in read_swap_header()
3025 swab32s(&swap_header->info.nr_badpages); in read_swap_header()
3026 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) in read_swap_header()
3028 for (i = 0; i < swap_header->info.nr_badpages; i++) in read_swap_header()
3029 swab32s(&swap_header->info.badpages[i]); in read_swap_header()
3031 /* Check the swap header's sub-version */ in read_swap_header()
3032 if (swap_header->info.version != 1) { in read_swap_header()
3034 swap_header->info.version); in read_swap_header()
3038 p->lowest_bit = 1; in read_swap_header()
3039 p->cluster_next = 1; in read_swap_header()
3040 p->cluster_nr = 0; in read_swap_header()
3043 last_page = swap_header->info.last_page; in read_swap_header()
3045 pr_warn("Empty swap-file\n"); in read_swap_header()
3050 maxpages << (PAGE_SHIFT - 10), in read_swap_header()
3051 last_page << (PAGE_SHIFT - 10)); in read_swap_header()
3055 /* p->max is an unsigned int: don't overflow it */ in read_swap_header()
3059 p->highest_bit = maxpages - 1; in read_swap_header()
3068 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) in read_swap_header()
3070 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) in read_swap_header()
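The read_swap_header() hits above clamp maxpages to what the architecture's swap pte encoding can represent; the comment fragments above describe probing this by creating an entry with type 0 and offset ~0UL, encoding it to a swap pte, decoding it back, and seeing how many offset bits survive. A userspace-only sketch of that probing idea, using a made-up 58-bit offset field rather than any real pte layout:

#include <stdio.h>

/* hypothetical encoding: the low 58 bits hold the offset, the rest the type */
#define TOY_OFFSET_BITS 58
#define TOY_OFFSET_MASK ((1ULL << TOY_OFFSET_BITS) - 1)

static unsigned long long toy_encode_offset(unsigned long long offset)
{
	return offset & TOY_OFFSET_MASK;	/* bits above 58 are silently lost */
}

int main(void)
{
	/* probe with the largest possible offset and see what survives */
	unsigned long long max_offset = toy_encode_offset(~0ULL);

	/* maxpages is then clamped to max_offset + 1, as the comment describes */
	printf("max encodable offset %#llx -> maxpages limit %llu\n",
	       max_offset, max_offset + 1);
	return 0;
}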
3094 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; in setup_swap_map_and_extents()
3097 nr_good_pages = maxpages - 1; /* omit header page */ in setup_swap_map_and_extents()
3099 cluster_list_init(&p->free_clusters); in setup_swap_map_and_extents()
3100 cluster_list_init(&p->discard_clusters); in setup_swap_map_and_extents()
3102 for (i = 0; i < swap_header->info.nr_badpages; i++) { in setup_swap_map_and_extents()
3103 unsigned int page_nr = swap_header->info.badpages[i]; in setup_swap_map_and_extents()
3104 if (page_nr == 0 || page_nr > swap_header->info.last_page) in setup_swap_map_and_extents()
3105 return -EINVAL; in setup_swap_map_and_extents()
3108 nr_good_pages--; in setup_swap_map_and_extents()
3128 p->max = maxpages; in setup_swap_map_and_extents()
3129 p->pages = nr_good_pages; in setup_swap_map_and_extents()
3133 nr_good_pages = p->pages; in setup_swap_map_and_extents()
3136 pr_warn("Empty swap-file\n"); in setup_swap_map_and_extents()
3137 return -EINVAL; in setup_swap_map_and_extents()
3157 cluster_list_add_tail(&p->free_clusters, cluster_info, in setup_swap_map_and_extents()
3170 struct request_queue *q = bdev_get_queue(si->bdev); in swap_discardable()
3193 struct page *page = NULL; in SYSCALL_DEFINE2() local
3198 return -EINVAL; in SYSCALL_DEFINE2()
3201 return -EPERM; in SYSCALL_DEFINE2()
3204 return -ENOMEM; in SYSCALL_DEFINE2()
3210 INIT_WORK(&p->discard_work, swap_discard_work); in SYSCALL_DEFINE2()
3225 p->swap_file = swap_file; in SYSCALL_DEFINE2()
3226 mapping = swap_file->f_mapping; in SYSCALL_DEFINE2()
3227 inode = mapping->host; in SYSCALL_DEFINE2()
3235 error = -EBUSY; in SYSCALL_DEFINE2()
3242 if (!mapping->a_ops->readpage) { in SYSCALL_DEFINE2()
3243 error = -EINVAL; in SYSCALL_DEFINE2()
3246 page = read_mapping_page(mapping, 0, swap_file); in SYSCALL_DEFINE2()
3247 if (IS_ERR(page)) { in SYSCALL_DEFINE2()
3248 error = PTR_ERR(page); in SYSCALL_DEFINE2()
3251 swap_header = kmap(page); in SYSCALL_DEFINE2()
3255 error = -EINVAL; in SYSCALL_DEFINE2()
3262 error = -ENOMEM; in SYSCALL_DEFINE2()
3266 if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) in SYSCALL_DEFINE2()
3267 p->flags |= SWP_STABLE_WRITES; in SYSCALL_DEFINE2()
3269 if (p->bdev && p->bdev->bd_disk->fops->rw_page) in SYSCALL_DEFINE2()
3270 p->flags |= SWP_SYNCHRONOUS_IO; in SYSCALL_DEFINE2()
3272 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { in SYSCALL_DEFINE2()
3276 p->flags |= SWP_SOLIDSTATE; in SYSCALL_DEFINE2()
3277 p->cluster_next_cpu = alloc_percpu(unsigned int); in SYSCALL_DEFINE2()
3278 if (!p->cluster_next_cpu) { in SYSCALL_DEFINE2()
3279 error = -ENOMEM; in SYSCALL_DEFINE2()
3287 per_cpu(*p->cluster_next_cpu, cpu) = in SYSCALL_DEFINE2()
3288 1 + prandom_u32_max(p->highest_bit); in SYSCALL_DEFINE2()
3295 error = -ENOMEM; in SYSCALL_DEFINE2()
3300 spin_lock_init(&((cluster_info + ci)->lock)); in SYSCALL_DEFINE2()
3302 p->percpu_cluster = alloc_percpu(struct percpu_cluster); in SYSCALL_DEFINE2()
3303 if (!p->percpu_cluster) { in SYSCALL_DEFINE2()
3304 error = -ENOMEM; in SYSCALL_DEFINE2()
3309 cluster = per_cpu_ptr(p->percpu_cluster, cpu); in SYSCALL_DEFINE2()
3310 cluster_set_null(&cluster->index); in SYSCALL_DEFINE2()
3317 error = swap_cgroup_swapon(p->type, maxpages); in SYSCALL_DEFINE2()
3327 /* frontswap enabled? set up bit-per-page map for frontswap */ in SYSCALL_DEFINE2()
3333 	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { in SYSCALL_DEFINE2()
3340 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | in SYSCALL_DEFINE2()
3345 * either do single-time area discards only, or to just in SYSCALL_DEFINE2()
3346 * perform discards for released swap page-clusters. in SYSCALL_DEFINE2()
3347 * Now it's time to adjust the p->flags accordingly. in SYSCALL_DEFINE2()
3350 p->flags &= ~SWP_PAGE_DISCARD; in SYSCALL_DEFINE2()
3352 p->flags &= ~SWP_AREA_DISCARD; in SYSCALL_DEFINE2()
3354 /* issue a swapon-time discard if it's still required */ in SYSCALL_DEFINE2()
3355 if (p->flags & SWP_AREA_DISCARD) { in SYSCALL_DEFINE2()
3363 error = init_swap_address_space(p->type, maxpages); in SYSCALL_DEFINE2()
3371 inode->i_flags |= S_SWAPFILE; in SYSCALL_DEFINE2()
3374 inode->i_flags &= ~S_SWAPFILE; in SYSCALL_DEFINE2()
3379 prio = -1; in SYSCALL_DEFINE2()
3386 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, in SYSCALL_DEFINE2()
3387 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), in SYSCALL_DEFINE2()
3388 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", in SYSCALL_DEFINE2()
3389 (p->flags & SWP_DISCARDABLE) ? "D" : "", in SYSCALL_DEFINE2()
3390 (p->flags & SWP_AREA_DISCARD) ? "s" : "", in SYSCALL_DEFINE2()
3391 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", in SYSCALL_DEFINE2()
3401 exit_swap_address_space(p->type); in SYSCALL_DEFINE2()
3405 free_percpu(p->percpu_cluster); in SYSCALL_DEFINE2()
3406 p->percpu_cluster = NULL; in SYSCALL_DEFINE2()
3407 free_percpu(p->cluster_next_cpu); in SYSCALL_DEFINE2()
3408 p->cluster_next_cpu = NULL; in SYSCALL_DEFINE2()
3409 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { in SYSCALL_DEFINE2()
3410 set_blocksize(p->bdev, p->old_block_size); in SYSCALL_DEFINE2()
3411 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); in SYSCALL_DEFINE2()
3415 swap_cgroup_swapoff(p->type); in SYSCALL_DEFINE2()
3417 p->swap_file = NULL; in SYSCALL_DEFINE2()
3418 p->flags = 0; in SYSCALL_DEFINE2()
3428 if (page && !IS_ERR(page)) { in SYSCALL_DEFINE2()
3429 kunmap(page); in SYSCALL_DEFINE2()
3430 put_page(page); in SYSCALL_DEFINE2()
3450 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) in si_swapinfo()
3451 nr_to_be_unused += si->inuse_pages; in si_swapinfo()
3453 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; in si_swapinfo()
3454 val->totalswap = total_swap_pages + nr_to_be_unused; in si_swapinfo()
3469 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) in free_swap_is_low()
3470 nr_to_be_unused += si->inuse_pages; in free_swap_is_low()
3484 * - success -> 0
3485 * - swp_entry is invalid -> EINVAL
3486 * - swp_entry is migration entry -> EINVAL
3487 * - swap-cache reference is requested but there is already one. -> EEXIST
3488 * - swap-cache reference is requested but the entry is not used. -> ENOENT
3489 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
3495 unsigned long offset; in __swap_duplicate() local
3498 int err = -EINVAL; in __swap_duplicate()
3504 offset = swp_offset(entry); in __swap_duplicate()
3505 ci = lock_cluster_or_swap_info(p, offset); in __swap_duplicate()
3507 count = p->swap_map[offset]; in __swap_duplicate()
3514 err = -ENOENT; in __swap_duplicate()
3528 err = -EEXIST; in __swap_duplicate()
3530 err = -ENOENT; in __swap_duplicate()
3537 err = -EINVAL; in __swap_duplicate()
3538 else if (swap_count_continued(p, offset, count)) in __swap_duplicate()
3541 err = -ENOMEM; in __swap_duplicate()
3543 err = -ENOENT; /* unused swap entry */ in __swap_duplicate()
3545 WRITE_ONCE(p->swap_map[offset], count | has_cache); in __swap_duplicate()
3566 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3568 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3569 * might occur if a page table entry has got corrupted.
3575 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) in swap_duplicate()
3585 * -EEXIST means there is a swap cache.
3596 unsigned long offset = swp_offset(entry); in swapcache_clear() local
3599 ci = lock_cluster_or_swap_info(si, offset); in swapcache_clear()
3600 usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); in swapcache_clear()
3611 struct swap_info_struct *page_swap_info(struct page *page) in page_swap_info() argument
3613 swp_entry_t entry = { .val = page_private(page) }; in page_swap_info()
3618 * out-of-line __page_file_ methods to avoid include hell.
3620 struct address_space *__page_file_mapping(struct page *page) in __page_file_mapping() argument
3622 return page_swap_info(page)->swap_file->f_mapping; in __page_file_mapping()
3626 pgoff_t __page_file_index(struct page *page) in __page_file_index() argument
3628 swp_entry_t swap = { .val = page_private(page) }; in __page_file_index()
3634 * add_swap_count_continuation - called when a swap count is duplicated
3635 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3636 * page of the original vmalloc'ed swap_map, to hold the continuation count
3641 * on the original swap_map, only referring to a continuation page when the
3645 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3652 struct page *head; in add_swap_count_continuation()
3653 struct page *page; in add_swap_count_continuation() local
3654 struct page *list_page; in add_swap_count_continuation()
3655 pgoff_t offset; in add_swap_count_continuation() local
3661 * for latency not to zero a page while GFP_ATOMIC and holding locks. in add_swap_count_continuation()
3663 page = alloc_page(gfp_mask | __GFP_HIGHMEM); in add_swap_count_continuation()
3673 spin_lock(&si->lock); in add_swap_count_continuation()
3675 offset = swp_offset(entry); in add_swap_count_continuation()
3677 ci = lock_cluster(si, offset); in add_swap_count_continuation()
3679 count = si->swap_map[offset] & ~SWAP_HAS_CACHE; in add_swap_count_continuation()
3685 * over-provisioning. in add_swap_count_continuation()
3690 if (!page) { in add_swap_count_continuation()
3691 ret = -ENOMEM; in add_swap_count_continuation()
3697 * no architecture is using highmem pages for kernel page tables: so it in add_swap_count_continuation()
3698 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. in add_swap_count_continuation()
3700 head = vmalloc_to_page(si->swap_map + offset); in add_swap_count_continuation()
3701 offset &= ~PAGE_MASK; in add_swap_count_continuation()
3703 spin_lock(&si->cont_lock); in add_swap_count_continuation()
3705 * Page allocation does not initialize the page's lru field, in add_swap_count_continuation()
3710 INIT_LIST_HEAD(&head->lru); in add_swap_count_continuation()
3712 si->flags |= SWP_CONTINUED; in add_swap_count_continuation()
3715 list_for_each_entry(list_page, &head->lru, lru) { in add_swap_count_continuation()
3720 * a continuation page, free our allocation and use this one. in add_swap_count_continuation()
3725 map = kmap_atomic(list_page) + offset; in add_swap_count_continuation()
3737 list_add_tail(&page->lru, &head->lru); in add_swap_count_continuation()
3738 page = NULL; /* now it's attached, don't free it */ in add_swap_count_continuation()
3740 spin_unlock(&si->cont_lock); in add_swap_count_continuation()
3743 spin_unlock(&si->lock); in add_swap_count_continuation()
3746 if (page) in add_swap_count_continuation()
3747 __free_page(page); in add_swap_count_continuation()
3752 * swap_count_continued - when the original swap_map count is incremented
3753 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3754 * into, carry if so, or else fail until a new continuation page is allocated;
3761 pgoff_t offset, unsigned char count) in swap_count_continued() argument
3763 struct page *head; in swap_count_continued()
3764 struct page *page; in swap_count_continued() local
3768 head = vmalloc_to_page(si->swap_map + offset); in swap_count_continued()
3774 spin_lock(&si->cont_lock); in swap_count_continued()
3775 offset &= ~PAGE_MASK; in swap_count_continued()
3776 page = list_next_entry(head, lru); in swap_count_continued()
3777 map = kmap_atomic(page) + offset; in swap_count_continued()
3788 page = list_next_entry(page, lru); in swap_count_continued()
3789 BUG_ON(page == head); in swap_count_continued()
3790 map = kmap_atomic(page) + offset; in swap_count_continued()
3794 page = list_next_entry(page, lru); in swap_count_continued()
3795 if (page == head) { in swap_count_continued()
3799 map = kmap_atomic(page) + offset; in swap_count_continued()
3800 init_map: *map = 0; /* we didn't zero the page */ in swap_count_continued()
3804 while ((page = list_prev_entry(page, lru)) != head) { in swap_count_continued()
3805 map = kmap_atomic(page) + offset; in swap_count_continued()
3818 page = list_next_entry(page, lru); in swap_count_continued()
3819 BUG_ON(page == head); in swap_count_continued()
3820 map = kmap_atomic(page) + offset; in swap_count_continued()
3823 *map -= 1; in swap_count_continued()
3827 while ((page = list_prev_entry(page, lru)) != head) { in swap_count_continued()
3828 map = kmap_atomic(page) + offset; in swap_count_continued()
3836 spin_unlock(&si->cont_lock); in swap_count_continued()
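swap_count_continued() above carries a per-slot reference count past what one byte can hold by chaining continuation pages, each contributing one extra byte per slot as a higher-order digit; swp_swapcount() earlier in the listing reassembles the full value. A standalone sketch of that reconstruction, with the flag and limit values assumed to match the kernel's SWAP_MAP_MAX / COUNT_CONTINUED / SWAP_CONT_MAX scheme and the swap-cache flag ignored:

#include <stdio.h>

#define SWAP_MAP_MAX	0x3e	/* assumed: max count held in the base byte */
#define COUNT_CONTINUED	0x80	/* assumed: "look at continuation pages" flag */
#define SWAP_CONT_MAX	0x7f	/* assumed: max value of one continuation digit */

/* Reconstruct a full count from a base byte plus continuation digits,
 * mirroring the loop in swp_swapcount(); cont[] plays the role of the
 * per-offset bytes found on successive continuation pages. */
static unsigned long full_count(unsigned char base, const unsigned char *cont)
{
	unsigned long count = base & ~COUNT_CONTINUED;
	unsigned long n = SWAP_MAP_MAX + 1;

	if (!(base & COUNT_CONTINUED))
		return count;

	for (;;) {
		unsigned char digit = *cont++;

		count += (unsigned long)(digit & ~COUNT_CONTINUED) * n;
		n *= SWAP_CONT_MAX + 1;
		if (!(digit & COUNT_CONTINUED))
			return count;
	}
}

int main(void)
{
	/* base saturated at SWAP_MAP_MAX with the flag set, one digit of 3:
	 * total = 0x3e + 3 * (0x3e + 1) = 62 + 189 = 251 */
	unsigned char cont[] = { 3 };

	printf("%lu\n", full_count(SWAP_MAP_MAX | COUNT_CONTINUED, cont));
	return 0;
}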
3841 * free_swap_count_continuations - swapoff free all the continuation pages
3846 pgoff_t offset; in free_swap_count_continuations() local
3848 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { in free_swap_count_continuations()
3849 struct page *head; in free_swap_count_continuations()
3850 head = vmalloc_to_page(si->swap_map + offset); in free_swap_count_continuations()
3852 struct page *page, *next; in free_swap_count_continuations() local
3854 list_for_each_entry_safe(page, next, &head->lru, lru) { in free_swap_count_continuations()
3855 list_del(&page->lru); in free_swap_count_continuations()
3856 __free_page(page); in free_swap_count_continuations()
3863 void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) in cgroup_throttle_swaprate() argument
3866 int nid = page_to_nid(page); in cgroup_throttle_swaprate()
3878 if (current->throttle_queue) in cgroup_throttle_swaprate()
3884 if (si->bdev) { in cgroup_throttle_swaprate()
3885 blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); in cgroup_throttle_swaprate()
3901 return -ENOMEM; in swapfile_init()