/*
 * unix_io.c --- This is the Unix (well, really POSIX) implementation
 *	of the I/O manager.
 *
 * Implements a small block cache (CACHE_SIZE entries) with optional
 * write-through.
 *
 * Includes support for Windows NT under Cygwin.
 *
 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
 *	2002 by Theodore Ts'o.
 *
 * %Begin-Header%
 * This file may be redistributed under the terms of the GNU Library
 * General Public License, version 2.
 * %End-Header%
 */
17 
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32 
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
58 #else
59 #define PR_GET_DUMPABLE 3
60 #endif
61 #if HAVE_SYS_STAT_H
62 #include <sys/stat.h>
63 #endif
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
66 #endif
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
69 #endif
70 #ifdef HAVE_PTHREAD
71 #include <pthread.h>
72 #endif
73 
74 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
75 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
76 #endif
77 
78 #undef ALIGN_DEBUG
79 
80 #include "ext2_fs.h"
81 #include "ext2fs.h"
82 #include "ext2fsP.h"
83 
/*
 * For checking structure magic numbers...
 *
 * If (ptr)->magic differs from (code), the *enclosing function* returns
 * (code).  The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement and cannot capture a following "else".
 */

#define EXT2_CHECK_MAGIC(ptr, code) \
	do { \
		if ((ptr)->magic != (code)) \
			return (code); \
	} while (0)
90 
/* One entry of the per-channel block cache. */
struct unix_cache {
	char			*buf;		/* buffer holding the cached block's data */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* stamp taken from the channel's access
						 * counter; the smallest stamp is evicted
						 * first */
	unsigned		dirty:1;	/* buf still has to be written out */
	unsigned		in_use:1;	/* entry currently holds a block */
};
98 
/* Number of entries in each channel's block cache. */
#define CACHE_SIZE 8
/* Writes of more than this many blocks bypass the cache in unix_write_blk64(). */
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
/* Sizes the reuse[] array used by unix_read_blk64() for multi-block reads. */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
102 
/* Per-channel state of the Unix I/O manager (channel->private_data). */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL */
	int	dev;		/* file descriptor all I/O is done on */
	int	flags;		/* IO_FLAG_* bits the channel was opened with */
	int	align;		/* NOTE(review): not referenced in the code visible
				 * here; alignment is read from channel->align */
	int	access_time;	/* monotonically increasing counter used to
				 * stamp cache entries */
	ext2_loff_t offset;	/* byte offset added to every block location */
	struct unix_cache cache[CACHE_SIZE];
	void	*bounce;	/* bounce buffer for unaligned/forced-bounce I/O */
	struct struct_io_stats io_stats;
#ifdef HAVE_PTHREAD
	pthread_mutex_t cache_mutex;	/* selected by CACHE_MTX */
	pthread_mutex_t bounce_mutex;	/* selected by BOUNCE_MTX */
	pthread_mutex_t stats_mutex;	/* selected by STATS_MTX */
#endif
};
119 
/*
 * True when n (an integer or a pointer) is a multiple of align.
 * align must be a power of two.  Both arguments are fully
 * parenthesized so that expressions such as IS_ALIGNED(p + 1, a) are
 * evaluated before the cast is applied.
 */
#define IS_ALIGNED(n, align) ((((uintptr_t) (n)) & \
			       ((uintptr_t) ((align)-1))) == 0)
122 
/* Selects which of the per-channel mutexes a lock helper operates on. */
typedef enum lock_kind {
	CACHE_MTX, BOUNCE_MTX, STATS_MTX
} kind_t;
126 
#ifdef HAVE_PTHREAD
/*
 * Map a lock kind to the matching mutex of the channel.  Returns NULL
 * when the channel was not opened with IO_FLAG_THREADS (no locking is
 * wanted) or when kind is not one of the known values.
 */
static inline pthread_mutex_t *get_mutex(struct unix_private_data *data,
					 kind_t kind)
{
	if (!(data->flags & IO_FLAG_THREADS))
		return NULL;

	if (kind == CACHE_MTX)
		return &data->cache_mutex;
	if (kind == BOUNCE_MTX)
		return &data->bounce_mutex;
	if (kind == STATS_MTX)
		return &data->stats_mutex;
	return NULL;
}
#endif
144 
/*
 * Acquire the mutex selected by kind, if the channel uses locking.
 * Compiles to a no-op when built without pthread support.
 */
static inline void mutex_lock(struct unix_private_data *data, kind_t kind)
{
#ifdef HAVE_PTHREAD
	pthread_mutex_t *lock = get_mutex(data, kind);

	if (lock == NULL)
		return;
	pthread_mutex_lock(lock);
#endif
}
154 
/*
 * Release the mutex selected by kind, if the channel uses locking.
 * Compiles to a no-op when built without pthread support.
 */
static inline void mutex_unlock(struct unix_private_data *data, kind_t kind)
{
#ifdef HAVE_PTHREAD
	pthread_mutex_t *lock = get_mutex(data, kind);

	if (lock == NULL)
		return;
	pthread_mutex_unlock(lock);
#endif
}
164 
unix_get_stats(io_channel channel,io_stats * stats)165 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
166 {
167 	errcode_t	retval = 0;
168 
169 	struct unix_private_data *data;
170 
171 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
172 	data = (struct unix_private_data *) channel->private_data;
173 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
174 
175 	if (stats) {
176 		mutex_lock(data, STATS_MTX);
177 		*stats = &data->io_stats;
178 		mutex_unlock(data, STATS_MTX);
179 	}
180 
181 	return retval;
182 }
183 
/*
 * getenv() wrapper that refuses to consult the environment when the
 * process runs with differing real/effective user or group IDs, or
 * (where prctl is available) when it is marked non-dumpable.  Returns
 * NULL in those cases, otherwise whatever the platform's (secure)
 * getenv returns for arg.
 */
static char *safe_getenv(const char *arg)
{
	if (getuid() != geteuid())
		return NULL;
	if (getgid() != getegid())
		return NULL;

#if defined(HAVE_PRCTL)
	if (prctl(PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#elif (defined(linux) && defined(SYS_prctl))
	if (syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#endif

#if defined(HAVE_SECURE_GETENV)
	return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
	return __secure_getenv(arg);
#else
	return getenv(arg);
#endif
}
206 
207 /*
208  * Here are the raw I/O functions
209  */
raw_read_blk(io_channel channel,struct unix_private_data * data,unsigned long long block,int count,void * bufv)210 static errcode_t raw_read_blk(io_channel channel,
211 			      struct unix_private_data *data,
212 			      unsigned long long block,
213 			      int count, void *bufv)
214 {
215 	errcode_t	retval;
216 	ssize_t		size;
217 	ext2_loff_t	location;
218 	int		actual = 0;
219 	unsigned char	*buf = bufv;
220 	ssize_t		really_read = 0;
221 	unsigned long long aligned_blk;
222 	int		align_size, offset;
223 
224 	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
225 	mutex_lock(data, STATS_MTX);
226 	data->io_stats.bytes_read += size;
227 	mutex_unlock(data, STATS_MTX);
228 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
229 
230 	if (data->flags & IO_FLAG_FORCE_BOUNCE)
231 		goto bounce_read;
232 
233 #ifdef HAVE_PREAD64
234 	/* Try an aligned pread */
235 	if ((channel->align == 0) ||
236 	    (IS_ALIGNED(buf, channel->align) &&
237 	     IS_ALIGNED(location, channel->align) &&
238 	     IS_ALIGNED(size, channel->align))) {
239 		actual = pread64(data->dev, buf, size, location);
240 		if (actual == size)
241 			return 0;
242 		actual = 0;
243 	}
244 #elif HAVE_PREAD
245 	/* Try an aligned pread */
246 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
247 	    ((channel->align == 0) ||
248 	     (IS_ALIGNED(buf, channel->align) &&
249 	      IS_ALIGNED(location, channel->align) &&
250 	      IS_ALIGNED(size, channel->align)))) {
251 		actual = pread(data->dev, buf, size, location);
252 		if (actual == size)
253 			return 0;
254 		actual = 0;
255 	}
256 #endif /* HAVE_PREAD */
257 
258 	if ((channel->align == 0) ||
259 	    (IS_ALIGNED(buf, channel->align) &&
260 	     IS_ALIGNED(location, channel->align) &&
261 	     IS_ALIGNED(size, channel->align))) {
262 		mutex_lock(data, BOUNCE_MTX);
263 		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
264 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
265 			goto error_unlock;
266 		}
267 		actual = read(data->dev, buf, size);
268 		if (actual != size) {
269 		short_read:
270 			if (actual < 0) {
271 				retval = errno;
272 				actual = 0;
273 			} else
274 				retval = EXT2_ET_SHORT_READ;
275 			goto error_unlock;
276 		}
277 		goto success_unlock;
278 	}
279 
280 #ifdef ALIGN_DEBUG
281 	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
282 	       (unsigned long) size);
283 #endif
284 
285 	/*
286 	 * The buffer or size which we're trying to read isn't aligned
287 	 * to the O_DIRECT rules, so we need to do this the hard way...
288 	 */
289 bounce_read:
290 	if (channel->align == 0)
291 		channel->align = 1;
292 	if ((channel->block_size > channel->align) &&
293 	    (channel->block_size % channel->align) == 0)
294 		align_size = channel->block_size;
295 	else
296 		align_size = channel->align;
297 	aligned_blk = location / align_size;
298 	offset = location % align_size;
299 
300 	mutex_lock(data, BOUNCE_MTX);
301 	if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
302 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
303 		goto error_unlock;
304 	}
305 	while (size > 0) {
306 		actual = read(data->dev, data->bounce, align_size);
307 		if (actual != align_size) {
308 			mutex_unlock(data, BOUNCE_MTX);
309 			actual = really_read;
310 			buf -= really_read;
311 			size += really_read;
312 			goto short_read;
313 		}
314 		actual = size;
315 		if (actual > align_size)
316 			actual = align_size;
317 		actual -= offset;
318 		memcpy(buf, data->bounce + offset, actual);
319 
320 		really_read += actual;
321 		size -= actual;
322 		buf += actual;
323 		offset = 0;
324 		aligned_blk++;
325 	}
326 success_unlock:
327 	mutex_unlock(data, BOUNCE_MTX);
328 	return 0;
329 
330 error_unlock:
331 	mutex_unlock(data, BOUNCE_MTX);
332 	if (actual >= 0 && actual < size)
333 		memset((char *) buf+actual, 0, size-actual);
334 	if (channel->read_error)
335 		retval = (channel->read_error)(channel, block, count, buf,
336 					       size, actual, retval);
337 	return retval;
338 }
339 
raw_write_blk(io_channel channel,struct unix_private_data * data,unsigned long long block,int count,const void * bufv)340 static errcode_t raw_write_blk(io_channel channel,
341 			       struct unix_private_data *data,
342 			       unsigned long long block,
343 			       int count, const void *bufv)
344 {
345 	ssize_t		size;
346 	ext2_loff_t	location;
347 	int		actual = 0;
348 	errcode_t	retval;
349 	const unsigned char *buf = bufv;
350 	unsigned long long aligned_blk;
351 	int		align_size, offset;
352 
353 	if (count == 1)
354 		size = channel->block_size;
355 	else {
356 		if (count < 0)
357 			size = -count;
358 		else
359 			size = (ext2_loff_t) count * channel->block_size;
360 	}
361 	mutex_lock(data, STATS_MTX);
362 	data->io_stats.bytes_written += size;
363 	mutex_unlock(data, STATS_MTX);
364 
365 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
366 
367 	if (data->flags & IO_FLAG_FORCE_BOUNCE)
368 		goto bounce_write;
369 
370 #ifdef HAVE_PWRITE64
371 	/* Try an aligned pwrite */
372 	if ((channel->align == 0) ||
373 	    (IS_ALIGNED(buf, channel->align) &&
374 	     IS_ALIGNED(location, channel->align) &&
375 	     IS_ALIGNED(size, channel->align))) {
376 		actual = pwrite64(data->dev, buf, size, location);
377 		if (actual == size)
378 			return 0;
379 	}
380 #elif HAVE_PWRITE
381 	/* Try an aligned pwrite */
382 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
383 	    ((channel->align == 0) ||
384 	     (IS_ALIGNED(buf, channel->align) &&
385 	      IS_ALIGNED(location, channel->align) &&
386 	      IS_ALIGNED(size, channel->align)))) {
387 		actual = pwrite(data->dev, buf, size, location);
388 		if (actual == size)
389 			return 0;
390 	}
391 #endif /* HAVE_PWRITE */
392 
393 	if ((channel->align == 0) ||
394 	    (IS_ALIGNED(buf, channel->align) &&
395 	     IS_ALIGNED(location, channel->align) &&
396 	     IS_ALIGNED(size, channel->align))) {
397 		mutex_lock(data, BOUNCE_MTX);
398 		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
399 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
400 			goto error_out;
401 		}
402 		actual = write(data->dev, buf, size);
403 		mutex_unlock(data, BOUNCE_MTX);
404 		if (actual < 0) {
405 			retval = errno;
406 			goto error_out;
407 		}
408 		if (actual != size) {
409 		short_write:
410 			retval = EXT2_ET_SHORT_WRITE;
411 			goto error_out;
412 		}
413 		return 0;
414 	}
415 
416 #ifdef ALIGN_DEBUG
417 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
418 	       (unsigned long) size);
419 #endif
420 	/*
421 	 * The buffer or size which we're trying to write isn't aligned
422 	 * to the O_DIRECT rules, so we need to do this the hard way...
423 	 */
424 bounce_write:
425 	if (channel->align == 0)
426 		channel->align = 1;
427 	if ((channel->block_size > channel->align) &&
428 	    (channel->block_size % channel->align) == 0)
429 		align_size = channel->block_size;
430 	else
431 		align_size = channel->align;
432 	aligned_blk = location / align_size;
433 	offset = location % align_size;
434 
435 	while (size > 0) {
436 		int actual_w;
437 
438 		mutex_lock(data, BOUNCE_MTX);
439 		if (size < align_size || offset) {
440 			if (ext2fs_llseek(data->dev, aligned_blk * align_size,
441 					  SEEK_SET) < 0) {
442 				retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
443 				goto error_unlock;
444 			}
445 			actual = read(data->dev, data->bounce,
446 				      align_size);
447 			if (actual != align_size) {
448 				if (actual < 0) {
449 					retval = errno;
450 					goto error_unlock;
451 				}
452 				memset((char *) data->bounce + actual, 0,
453 				       align_size - actual);
454 			}
455 		}
456 		actual = size;
457 		if (actual > align_size)
458 			actual = align_size;
459 		actual -= offset;
460 		memcpy(((char *)data->bounce) + offset, buf, actual);
461 		if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
462 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
463 			goto error_unlock;
464 		}
465 		actual_w = write(data->dev, data->bounce, align_size);
466 		mutex_unlock(data, BOUNCE_MTX);
467 		if (actual_w < 0) {
468 			retval = errno;
469 			goto error_out;
470 		}
471 		if (actual_w != align_size)
472 			goto short_write;
473 		size -= actual;
474 		buf += actual;
475 		location += actual;
476 		aligned_blk++;
477 		offset = 0;
478 	}
479 	return 0;
480 
481 error_unlock:
482 	mutex_unlock(data, BOUNCE_MTX);
483 error_out:
484 	if (channel->write_error)
485 		retval = (channel->write_error)(channel, block, count, buf,
486 						size, actual, retval);
487 	return retval;
488 }
489 
490 
491 /*
492  * Here we implement the cache functions
493  */
494 
495 /* Allocate the cache buffers */
alloc_cache(io_channel channel,struct unix_private_data * data)496 static errcode_t alloc_cache(io_channel channel,
497 			     struct unix_private_data *data)
498 {
499 	errcode_t		retval;
500 	struct unix_cache	*cache;
501 	int			i;
502 
503 	data->access_time = 0;
504 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
505 		cache->block = 0;
506 		cache->access_time = 0;
507 		cache->dirty = 0;
508 		cache->in_use = 0;
509 		if (cache->buf)
510 			ext2fs_free_mem(&cache->buf);
511 		retval = io_channel_alloc_buf(channel, 0, &cache->buf);
512 		if (retval)
513 			return retval;
514 	}
515 	if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
516 		if (data->bounce)
517 			ext2fs_free_mem(&data->bounce);
518 		retval = io_channel_alloc_buf(channel, 0, &data->bounce);
519 	}
520 	return retval;
521 }
522 
523 /* Free the cache buffers */
free_cache(struct unix_private_data * data)524 static void free_cache(struct unix_private_data *data)
525 {
526 	struct unix_cache	*cache;
527 	int			i;
528 
529 	data->access_time = 0;
530 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
531 		cache->block = 0;
532 		cache->access_time = 0;
533 		cache->dirty = 0;
534 		cache->in_use = 0;
535 		if (cache->buf)
536 			ext2fs_free_mem(&cache->buf);
537 	}
538 	if (data->bounce)
539 		ext2fs_free_mem(&data->bounce);
540 }
541 
542 #ifndef NO_IO_CACHE
543 /*
544  * Try to find a block in the cache.  If the block is not found, and
545  * eldest is a non-zero pointer, then fill in eldest with the cache
546  * entry to that should be reused.
547  */
find_cached_block(struct unix_private_data * data,unsigned long long block,struct unix_cache ** eldest)548 static struct unix_cache *find_cached_block(struct unix_private_data *data,
549 					    unsigned long long block,
550 					    struct unix_cache **eldest)
551 {
552 	struct unix_cache	*cache, *unused_cache, *oldest_cache;
553 	int			i;
554 
555 	unused_cache = oldest_cache = 0;
556 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
557 		if (!cache->in_use) {
558 			if (!unused_cache)
559 				unused_cache = cache;
560 			continue;
561 		}
562 		if (cache->block == block) {
563 			cache->access_time = ++data->access_time;
564 			return cache;
565 		}
566 		if (!oldest_cache ||
567 		    (cache->access_time < oldest_cache->access_time))
568 			oldest_cache = cache;
569 	}
570 	if (eldest)
571 		*eldest = (unused_cache) ? unused_cache : oldest_cache;
572 	return 0;
573 }
574 
575 /*
576  * Reuse a particular cache entry for another block.
577  */
reuse_cache(io_channel channel,struct unix_private_data * data,struct unix_cache * cache,unsigned long long block)578 static void reuse_cache(io_channel channel, struct unix_private_data *data,
579 		 struct unix_cache *cache, unsigned long long block)
580 {
581 	if (cache->dirty && cache->in_use)
582 		raw_write_blk(channel, data, cache->block, 1, cache->buf);
583 
584 	cache->in_use = 1;
585 	cache->dirty = 0;
586 	cache->block = block;
587 	cache->access_time = ++data->access_time;
588 }
589 
590 #define FLUSH_INVALIDATE	0x01
591 #define FLUSH_NOLOCK		0x02
592 
593 /*
594  * Flush all of the blocks in the cache
595  */
flush_cached_blocks(io_channel channel,struct unix_private_data * data,int flags)596 static errcode_t flush_cached_blocks(io_channel channel,
597 				     struct unix_private_data *data,
598 				     int flags)
599 {
600 	struct unix_cache	*cache;
601 	errcode_t		retval, retval2;
602 	int			i;
603 
604 	retval2 = 0;
605 	if ((flags & FLUSH_NOLOCK) == 0)
606 		mutex_lock(data, CACHE_MTX);
607 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
608 		if (!cache->in_use)
609 			continue;
610 
611 		if (flags & FLUSH_INVALIDATE)
612 			cache->in_use = 0;
613 
614 		if (!cache->dirty)
615 			continue;
616 
617 		retval = raw_write_blk(channel, data,
618 				       cache->block, 1, cache->buf);
619 		if (retval)
620 			retval2 = retval;
621 		else
622 			cache->dirty = 0;
623 	}
624 	if ((flags & FLUSH_NOLOCK) == 0)
625 		mutex_unlock(data, CACHE_MTX);
626 	return retval2;
627 }
628 #endif /* NO_IO_CACHE */
629 
630 #ifdef __linux__
631 #ifndef BLKDISCARDZEROES
632 #define BLKDISCARDZEROES _IO(0x12,124)
633 #endif
634 #endif
635 
/*
 * Open pathname with the given open(2) flags, using the 64-bit variant
 * of open where the build provides it.  The mode argument is passed on
 * only when it is non-zero.  Returns the new file descriptor, or a
 * negative value with errno set as open(2) does.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	if (!mode)
		return open64(pathname, flags);
	return open64(pathname, flags, mode);
#else
	if (!mode)
		return open(pathname, flags);
	return open(pathname, flags, mode);
#endif
}
649 
/*
 * stat(2) wrapper that uses the 64-bit variant where the build
 * provides it.  Returns what the underlying call returns.
 */
int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
{
	int	rc;

#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	rc = stat64(path, buf);
#else
	rc = stat(path, buf);
#endif
	return rc;
}
658 
/*
 * fstat(2) wrapper that uses the 64-bit variant where the build
 * provides it.  Returns what the underlying call returns.
 */
int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
{
	int	rc;

#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	rc = fstat64(fd, buf);
#else
	rc = fstat(fd, buf);
#endif
	return rc;
}
667 
668 
unix_open_channel(const char * name,int fd,int flags,io_channel * channel,io_manager io_mgr)669 static errcode_t unix_open_channel(const char *name, int fd,
670 				   int flags, io_channel *channel,
671 				   io_manager io_mgr)
672 {
673 	io_channel	io = NULL;
674 	struct unix_private_data *data = NULL;
675 	errcode_t	retval;
676 	ext2fs_struct_stat st;
677 #ifdef __linux__
678 	struct		utsname ut;
679 #endif
680 
681 	if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
682 		flags |= IO_FLAG_FORCE_BOUNCE;
683 
684 #ifdef __linux__
685 	/*
686 	 * We need to make sure any previous errors in the block
687 	 * device are thrown away, sigh.
688 	 */
689 	(void) fsync(fd);
690 #endif
691 
692 	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
693 	if (retval)
694 		goto cleanup;
695 	memset(io, 0, sizeof(struct struct_io_channel));
696 	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
697 	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
698 	if (retval)
699 		goto cleanup;
700 
701 	io->manager = io_mgr;
702 	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
703 	if (retval)
704 		goto cleanup;
705 
706 	strcpy(io->name, name);
707 	io->private_data = data;
708 	io->block_size = 1024;
709 	io->read_error = 0;
710 	io->write_error = 0;
711 	io->refcount = 1;
712 	io->flags = 0;
713 
714 	memset(data, 0, sizeof(struct unix_private_data));
715 	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
716 	data->io_stats.num_fields = 2;
717 	data->flags = flags;
718 	data->dev = fd;
719 
720 #if defined(O_DIRECT)
721 	if (flags & IO_FLAG_DIRECT_IO)
722 		io->align = ext2fs_get_dio_alignment(data->dev);
723 #elif defined(F_NOCACHE)
724 	if (flags & IO_FLAG_DIRECT_IO)
725 		io->align = 4096;
726 #endif
727 
728 	/*
729 	 * If the device is really a block device, then set the
730 	 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
731 	 * because we are going to use punch hole instead of discard
732 	 * and if it succeed, subsequent read from sparse area returns
733 	 * zero.
734 	 */
735 	if (ext2fs_fstat(data->dev, &st) == 0) {
736 		if (ext2fsP_is_disk_device(st.st_mode))
737 			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
738 		else
739 			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
740 	}
741 
742 #ifdef BLKDISCARDZEROES
743 	{
744 		int zeroes = 0;
745 		if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
746 		    zeroes)
747 			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
748 	}
749 #endif
750 
751 #if defined(__CYGWIN__)
752 	/*
753 	 * Some operating systems require that the buffers be aligned,
754 	 * regardless of O_DIRECT
755 	 */
756 	if (!io->align)
757 		io->align = 512;
758 #endif
759 
760 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
761 	if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
762 		int dio_align = ext2fs_get_dio_alignment(fd);
763 
764 		if (io->align < dio_align)
765 			io->align = dio_align;
766 	}
767 #endif
768 
769 	if ((retval = alloc_cache(io, data)))
770 		goto cleanup;
771 
772 #ifdef BLKROGET
773 	if (flags & IO_FLAG_RW) {
774 		int error;
775 		int readonly = 0;
776 
777 		/* Is the block device actually writable? */
778 		error = ioctl(data->dev, BLKROGET, &readonly);
779 		if (!error && readonly) {
780 			retval = EPERM;
781 			goto cleanup;
782 		}
783 	}
784 #endif
785 
786 #ifdef __linux__
787 #undef RLIM_INFINITY
788 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
789 #define RLIM_INFINITY	((unsigned long)(~0UL>>1))
790 #else
791 #define RLIM_INFINITY  (~0UL)
792 #endif
793 	/*
794 	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
795 	 * block devices are wrongly getting hit by the filesize
796 	 * limit.  This workaround isn't perfect, since it won't work
797 	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
798 	 *
799 	 */
800 	if ((flags & IO_FLAG_RW) &&
801 	    (uname(&ut) == 0) &&
802 	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
803 	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
804 	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
805 	     (ut.release[5] < '8')) &&
806 	    (ext2fs_fstat(data->dev, &st) == 0) &&
807 	    (ext2fsP_is_disk_device(st.st_mode))) {
808 		struct rlimit	rlim;
809 
810 		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
811 		setrlimit(RLIMIT_FSIZE, &rlim);
812 		getrlimit(RLIMIT_FSIZE, &rlim);
813 		if (((unsigned long) rlim.rlim_cur) <
814 		    ((unsigned long) rlim.rlim_max)) {
815 			rlim.rlim_cur = rlim.rlim_max;
816 			setrlimit(RLIMIT_FSIZE, &rlim);
817 		}
818 	}
819 #endif
820 #ifdef HAVE_PTHREAD
821 	if (flags & IO_FLAG_THREADS) {
822 		io->flags |= CHANNEL_FLAGS_THREADS;
823 		retval = pthread_mutex_init(&data->cache_mutex, NULL);
824 		if (retval)
825 			goto cleanup;
826 		retval = pthread_mutex_init(&data->bounce_mutex, NULL);
827 		if (retval) {
828 			pthread_mutex_destroy(&data->cache_mutex);
829 			goto cleanup;
830 		}
831 		retval = pthread_mutex_init(&data->stats_mutex, NULL);
832 		if (retval) {
833 			pthread_mutex_destroy(&data->cache_mutex);
834 			pthread_mutex_destroy(&data->bounce_mutex);
835 			goto cleanup;
836 		}
837 	}
838 #endif
839 	*channel = io;
840 	return 0;
841 
842 cleanup:
843 	if (data) {
844 		if (data->dev >= 0)
845 			close(data->dev);
846 		free_cache(data);
847 		ext2fs_free_mem(&data);
848 	}
849 	if (io) {
850 		if (io->name) {
851 			ext2fs_free_mem(&io->name);
852 		}
853 		ext2fs_free_mem(&io);
854 	}
855 	return retval;
856 }
857 
unixfd_open(const char * str_fd,int flags,io_channel * channel)858 static errcode_t unixfd_open(const char *str_fd, int flags,
859 			     io_channel *channel)
860 {
861 	int fd;
862 	int fd_flags;
863 
864 	fd = atoi(str_fd);
865 #if defined(HAVE_FCNTL)
866 	fd_flags = fcntl(fd, F_GETFD);
867 	if (fd_flags == -1)
868 		return EBADF;
869 
870 	flags = 0;
871 	if (fd_flags & O_RDWR)
872 		flags |= IO_FLAG_RW;
873 	if (fd_flags & O_EXCL)
874 		flags |= IO_FLAG_EXCLUSIVE;
875 #if defined(O_DIRECT)
876 	if (fd_flags & O_DIRECT)
877 		flags |= IO_FLAG_DIRECT_IO;
878 #endif
879 #endif  /* HAVE_FCNTL */
880 
881 	return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
882 }
883 
unix_open(const char * name,int flags,io_channel * channel)884 static errcode_t unix_open(const char *name, int flags,
885 			   io_channel *channel)
886 {
887 	int fd = -1;
888 	int open_flags;
889 
890 	if (name == 0)
891 		return EXT2_ET_BAD_DEVICE_NAME;
892 
893 	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
894 	if (flags & IO_FLAG_EXCLUSIVE)
895 		open_flags |= O_EXCL;
896 #if defined(O_DIRECT)
897 	if (flags & IO_FLAG_DIRECT_IO)
898 		open_flags |= O_DIRECT;
899 #endif
900 	fd = ext2fs_open_file(name, open_flags, 0);
901 	if (fd < 0)
902 		return errno;
903 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
904 	if (flags & IO_FLAG_DIRECT_IO) {
905 		if (fcntl(fd, F_NOCACHE, 1) < 0)
906 			return errno;
907 	}
908 #endif
909 	return unix_open_channel(name, fd, flags, channel, unix_io_manager);
910 }
911 
unix_close(io_channel channel)912 static errcode_t unix_close(io_channel channel)
913 {
914 	struct unix_private_data *data;
915 	errcode_t	retval = 0;
916 
917 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
918 	data = (struct unix_private_data *) channel->private_data;
919 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
920 
921 	if (--channel->refcount > 0)
922 		return 0;
923 
924 #ifndef NO_IO_CACHE
925 	retval = flush_cached_blocks(channel, data, 0);
926 #endif
927 
928 	if (close(data->dev) < 0)
929 		retval = errno;
930 	free_cache(data);
931 #ifdef HAVE_PTHREAD
932 	if (data->flags & IO_FLAG_THREADS) {
933 		pthread_mutex_destroy(&data->cache_mutex);
934 		pthread_mutex_destroy(&data->bounce_mutex);
935 		pthread_mutex_destroy(&data->stats_mutex);
936 	}
937 #endif
938 
939 	ext2fs_free_mem(&channel->private_data);
940 	if (channel->name)
941 		ext2fs_free_mem(&channel->name);
942 	ext2fs_free_mem(&channel);
943 	return retval;
944 }
945 
unix_set_blksize(io_channel channel,int blksize)946 static errcode_t unix_set_blksize(io_channel channel, int blksize)
947 {
948 	struct unix_private_data *data;
949 	errcode_t		retval = 0;
950 
951 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
952 	data = (struct unix_private_data *) channel->private_data;
953 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
954 
955 	if (channel->block_size != blksize) {
956 		mutex_lock(data, CACHE_MTX);
957 		mutex_lock(data, BOUNCE_MTX);
958 #ifndef NO_IO_CACHE
959 		if ((retval = flush_cached_blocks(channel, data, FLUSH_NOLOCK)))
960 			return retval;
961 #endif
962 
963 		channel->block_size = blksize;
964 		free_cache(data);
965 		retval = alloc_cache(channel, data);
966 		mutex_unlock(data, BOUNCE_MTX);
967 		mutex_unlock(data, CACHE_MTX);
968 	}
969 	return retval;
970 }
971 
unix_read_blk64(io_channel channel,unsigned long long block,int count,void * buf)972 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
973 			       int count, void *buf)
974 {
975 	struct unix_private_data *data;
976 	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
977 	errcode_t	retval = 0;
978 	char		*cp;
979 	int		i, j;
980 
981 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
982 	data = (struct unix_private_data *) channel->private_data;
983 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
984 
985 #ifdef NO_IO_CACHE
986 	return raw_read_blk(channel, data, block, count, buf);
987 #else
988 	if (data->flags & IO_FLAG_NOCACHE)
989 		return raw_read_blk(channel, data, block, count, buf);
990 	/*
991 	 * If we're doing an odd-sized read or a very large read,
992 	 * flush out the cache and then do a direct read.
993 	 */
994 	if (count < 0 || count > WRITE_DIRECT_SIZE) {
995 		if ((retval = flush_cached_blocks(channel, data, 0)))
996 			return retval;
997 		return raw_read_blk(channel, data, block, count, buf);
998 	}
999 
1000 	cp = buf;
1001 	mutex_lock(data, CACHE_MTX);
1002 	while (count > 0) {
1003 		/* If it's in the cache, use it! */
1004 		if ((cache = find_cached_block(data, block, &reuse[0]))) {
1005 #ifdef DEBUG
1006 			printf("Using cached block %lu\n", block);
1007 #endif
1008 			memcpy(cp, cache->buf, channel->block_size);
1009 			count--;
1010 			block++;
1011 			cp += channel->block_size;
1012 			continue;
1013 		}
1014 		if (count == 1) {
1015 			/*
1016 			 * Special case where we read directly into the
1017 			 * cache buffer; important in the O_DIRECT case
1018 			 */
1019 			cache = reuse[0];
1020 			reuse_cache(channel, data, cache, block);
1021 			if ((retval = raw_read_blk(channel, data, block, 1,
1022 						   cache->buf))) {
1023 				cache->in_use = 0;
1024 				break;
1025 			}
1026 			memcpy(cp, cache->buf, channel->block_size);
1027 			retval = 0;
1028 			break;
1029 		}
1030 
1031 		/*
1032 		 * Find the number of uncached blocks so we can do a
1033 		 * single read request
1034 		 */
1035 		for (i=1; i < count; i++)
1036 			if (find_cached_block(data, block+i, &reuse[i]))
1037 				break;
1038 #ifdef DEBUG
1039 		printf("Reading %d blocks starting at %lu\n", i, block);
1040 #endif
1041 		if ((retval = raw_read_blk(channel, data, block, i, cp)))
1042 			break;
1043 
1044 		/* Save the results in the cache */
1045 		for (j=0; j < i; j++) {
1046 			count--;
1047 			cache = reuse[j];
1048 			reuse_cache(channel, data, cache, block++);
1049 			memcpy(cache->buf, cp, channel->block_size);
1050 			cp += channel->block_size;
1051 		}
1052 	}
1053 	mutex_unlock(data, CACHE_MTX);
1054 	return retval;
1055 #endif /* NO_IO_CACHE */
1056 }
1057 
unix_read_blk(io_channel channel,unsigned long block,int count,void * buf)1058 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
1059 			       int count, void *buf)
1060 {
1061 	return unix_read_blk64(channel, block, count, buf);
1062 }
1063 
unix_write_blk64(io_channel channel,unsigned long long block,int count,const void * buf)1064 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
1065 				int count, const void *buf)
1066 {
1067 	struct unix_private_data *data;
1068 	struct unix_cache *cache, *reuse;
1069 	errcode_t	retval = 0;
1070 	const char	*cp;
1071 	int		writethrough;
1072 
1073 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1074 	data = (struct unix_private_data *) channel->private_data;
1075 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1076 
1077 #ifdef NO_IO_CACHE
1078 	return raw_write_blk(channel, data, block, count, buf);
1079 #else
1080 	if (data->flags & IO_FLAG_NOCACHE)
1081 		return raw_write_blk(channel, data, block, count, buf);
1082 	/*
1083 	 * If we're doing an odd-sized write or a very large write,
1084 	 * flush out the cache completely and then do a direct write.
1085 	 */
1086 	if (count < 0 || count > WRITE_DIRECT_SIZE) {
1087 		if ((retval = flush_cached_blocks(channel, data,
1088 						  FLUSH_INVALIDATE)))
1089 			return retval;
1090 		return raw_write_blk(channel, data, block, count, buf);
1091 	}
1092 
1093 	/*
1094 	 * For a moderate-sized multi-block write, first force a write
1095 	 * if we're in write-through cache mode, and then fill the
1096 	 * cache with the blocks.
1097 	 */
1098 	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
1099 	if (writethrough)
1100 		retval = raw_write_blk(channel, data, block, count, buf);
1101 
1102 	cp = buf;
1103 	mutex_lock(data, CACHE_MTX);
1104 	while (count > 0) {
1105 		cache = find_cached_block(data, block, &reuse);
1106 		if (!cache) {
1107 			cache = reuse;
1108 			reuse_cache(channel, data, cache, block);
1109 		}
1110 		if (cache->buf != cp)
1111 			memcpy(cache->buf, cp, channel->block_size);
1112 		cache->dirty = !writethrough;
1113 		count--;
1114 		block++;
1115 		cp += channel->block_size;
1116 	}
1117 	mutex_unlock(data, CACHE_MTX);
1118 	return retval;
1119 #endif /* NO_IO_CACHE */
1120 }
1121 
unix_cache_readahead(io_channel channel,unsigned long long block,unsigned long long count)1122 static errcode_t unix_cache_readahead(io_channel channel,
1123 				      unsigned long long block,
1124 				      unsigned long long count)
1125 {
1126 #ifdef POSIX_FADV_WILLNEED
1127 	struct unix_private_data *data;
1128 
1129 	data = (struct unix_private_data *)channel->private_data;
1130 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1131 	return posix_fadvise(data->dev,
1132 			     (ext2_loff_t)block * channel->block_size + data->offset,
1133 			     (ext2_loff_t)count * channel->block_size,
1134 			     POSIX_FADV_WILLNEED);
1135 #else
1136 	return EXT2_ET_OP_NOT_SUPPORTED;
1137 #endif
1138 }
1139 
unix_write_blk(io_channel channel,unsigned long block,int count,const void * buf)1140 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
1141 				int count, const void *buf)
1142 {
1143 	return unix_write_blk64(channel, block, count, buf);
1144 }
1145 
unix_write_byte(io_channel channel,unsigned long offset,int size,const void * buf)1146 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
1147 				 int size, const void *buf)
1148 {
1149 	struct unix_private_data *data;
1150 	errcode_t	retval = 0;
1151 	ssize_t		actual;
1152 
1153 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1154 	data = (struct unix_private_data *) channel->private_data;
1155 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1156 
1157 	if (channel->align != 0) {
1158 #ifdef ALIGN_DEBUG
1159 		printf("unix_write_byte: O_DIRECT fallback\n");
1160 #endif
1161 		return EXT2_ET_UNIMPLEMENTED;
1162 	}
1163 
1164 #ifndef NO_IO_CACHE
1165 	/*
1166 	 * Flush out the cache completely
1167 	 */
1168 	if ((retval = flush_cached_blocks(channel, data, FLUSH_INVALIDATE)))
1169 		return retval;
1170 #endif
1171 
1172 	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
1173 		return errno;
1174 
1175 	actual = write(data->dev, buf, size);
1176 	if (actual < 0)
1177 		return errno;
1178 	if (actual != size)
1179 		return EXT2_ET_SHORT_WRITE;
1180 
1181 	return 0;
1182 }
1183 
1184 /*
1185  * Flush data buffers to disk.
1186  */
unix_flush(io_channel channel)1187 static errcode_t unix_flush(io_channel channel)
1188 {
1189 	struct unix_private_data *data;
1190 	errcode_t retval = 0;
1191 
1192 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1193 	data = (struct unix_private_data *) channel->private_data;
1194 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1195 
1196 #ifndef NO_IO_CACHE
1197 	retval = flush_cached_blocks(channel, data, 0);
1198 #endif
1199 #ifdef HAVE_FSYNC
1200 	if (!retval && fsync(data->dev) != 0)
1201 		return errno;
1202 #endif
1203 	return retval;
1204 }
1205 
unix_set_option(io_channel channel,const char * option,const char * arg)1206 static errcode_t unix_set_option(io_channel channel, const char *option,
1207 				 const char *arg)
1208 {
1209 	struct unix_private_data *data;
1210 	unsigned long long tmp;
1211 	errcode_t retval;
1212 	char *end;
1213 
1214 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1215 	data = (struct unix_private_data *) channel->private_data;
1216 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1217 
1218 	if (!strcmp(option, "offset")) {
1219 		if (!arg)
1220 			return EXT2_ET_INVALID_ARGUMENT;
1221 
1222 		tmp = strtoull(arg, &end, 0);
1223 		if (*end)
1224 			return EXT2_ET_INVALID_ARGUMENT;
1225 		data->offset = tmp;
1226 		if (data->offset < 0)
1227 			return EXT2_ET_INVALID_ARGUMENT;
1228 		return 0;
1229 	}
1230 	if (!strcmp(option, "cache")) {
1231 		if (!arg)
1232 			return EXT2_ET_INVALID_ARGUMENT;
1233 		if (!strcmp(arg, "on")) {
1234 			data->flags &= ~IO_FLAG_NOCACHE;
1235 			return 0;
1236 		}
1237 		if (!strcmp(arg, "off")) {
1238 			retval = flush_cached_blocks(channel, data, 0);
1239 			data->flags |= IO_FLAG_NOCACHE;
1240 			return retval;
1241 		}
1242 		return EXT2_ET_INVALID_ARGUMENT;
1243 	}
1244 	return EXT2_ET_INVALID_ARGUMENT;
1245 }
1246 
1247 #if defined(__linux__) && !defined(BLKDISCARD)
1248 #define BLKDISCARD		_IO(0x12,119)
1249 #endif
1250 
unix_discard(io_channel channel,unsigned long long block,unsigned long long count)1251 static errcode_t unix_discard(io_channel channel, unsigned long long block,
1252 			      unsigned long long count)
1253 {
1254 	struct unix_private_data *data;
1255 	int		ret;
1256 
1257 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1258 	data = (struct unix_private_data *) channel->private_data;
1259 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1260 
1261 	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1262 #ifdef BLKDISCARD
1263 		__u64 range[2];
1264 
1265 		range[0] = (__u64)(block) * channel->block_size + data->offset;
1266 		range[1] = (__u64)(count) * channel->block_size;
1267 
1268 		ret = ioctl(data->dev, BLKDISCARD, &range);
1269 #else
1270 		goto unimplemented;
1271 #endif
1272 	} else {
1273 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1274 		/*
1275 		 * If we are not on block device, try to use punch hole
1276 		 * to reclaim free space.
1277 		 */
1278 		ret = fallocate(data->dev,
1279 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1280 				(off_t)(block) * channel->block_size + data->offset,
1281 				(off_t)(count) * channel->block_size);
1282 #else
1283 		goto unimplemented;
1284 #endif
1285 	}
1286 	if (ret < 0) {
1287 		if (errno == EOPNOTSUPP)
1288 			goto unimplemented;
1289 		return errno;
1290 	}
1291 	return 0;
1292 unimplemented:
1293 	return EXT2_ET_UNIMPLEMENTED;
1294 }
1295 
/*
 * Zero the byte range [offset, offset + len) of "fd" using fallocate().
 *
 * FALLOC_FL_ZERO_RANGE is tried first, when known, because it does not
 * unmap preallocated blocks; FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE
 * is the second choice.  fallocate is preferred because it always
 * invalidates the page cache, and libext2fs requires that reads after
 * a zero-out return zeroes.
 *
 * Returns 0 as soon as one method succeeds.  Otherwise returns the
 * last failing result (-1 if no method was compiled in) with errno
 * forced to EOPNOTSUPP, whatever the underlying failure was.
 */
static int __unix_zeroout(int fd, off_t offset, off_t len)
{
	int rc = -1;

#ifdef HAVE_FALLOCATE
#ifdef FALLOC_FL_ZERO_RANGE
	rc = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
	if (!rc)
		return 0;
#endif
#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
	rc = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		       offset, len);
	if (!rc)
		return 0;
#endif
#endif /* HAVE_FALLOCATE */

	/* Nothing worked (or nothing was available): report "unsupported". */
	errno = EOPNOTSUPP;
	return rc;
}
1320 
1321 /* parameters might not be used if OS doesn't support zeroout */
1322 #if __GNUC_PREREQ (4, 6)
1323 #pragma GCC diagnostic push
1324 #pragma GCC diagnostic ignored "-Wunused-parameter"
1325 #endif
unix_zeroout(io_channel channel,unsigned long long block,unsigned long long count)1326 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1327 			      unsigned long long count)
1328 {
1329 	struct unix_private_data *data;
1330 	int		ret;
1331 
1332 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1333 	data = (struct unix_private_data *) channel->private_data;
1334 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1335 
1336 	if (safe_getenv("UNIX_IO_NOZEROOUT"))
1337 		goto unimplemented;
1338 
1339 	if (!(channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE)) {
1340 		/* Regular file, try to use truncate/punch/zero. */
1341 		struct stat statbuf;
1342 
1343 		if (count == 0)
1344 			return 0;
1345 		/*
1346 		 * If we're trying to zero a range past the end of the file,
1347 		 * extend the file size, then truncate everything.
1348 		 */
1349 		ret = fstat(data->dev, &statbuf);
1350 		if (ret)
1351 			goto err;
1352 		if ((unsigned long long) statbuf.st_size <
1353 			(block + count) * channel->block_size + data->offset) {
1354 			ret = ftruncate(data->dev,
1355 					(block + count) * channel->block_size + data->offset);
1356 			if (ret)
1357 				goto err;
1358 		}
1359 	}
1360 
1361 	ret = __unix_zeroout(data->dev,
1362 			(off_t)(block) * channel->block_size + data->offset,
1363 			(off_t)(count) * channel->block_size);
1364 err:
1365 	if (ret < 0) {
1366 		if (errno == EOPNOTSUPP)
1367 			goto unimplemented;
1368 		return errno;
1369 	}
1370 	return 0;
1371 unimplemented:
1372 	return EXT2_ET_UNIMPLEMENTED;
1373 }
1374 #if __GNUC_PREREQ (4, 6)
1375 #pragma GCC diagnostic pop
1376 #endif
1377 
1378 static struct struct_io_manager struct_unix_manager = {
1379 	.magic		= EXT2_ET_MAGIC_IO_MANAGER,
1380 	.name		= "Unix I/O Manager",
1381 	.open		= unix_open,
1382 	.close		= unix_close,
1383 	.set_blksize	= unix_set_blksize,
1384 	.read_blk	= unix_read_blk,
1385 	.write_blk	= unix_write_blk,
1386 	.flush		= unix_flush,
1387 	.write_byte	= unix_write_byte,
1388 	.set_option	= unix_set_option,
1389 	.get_stats	= unix_get_stats,
1390 	.read_blk64	= unix_read_blk64,
1391 	.write_blk64	= unix_write_blk64,
1392 	.discard	= unix_discard,
1393 	.cache_readahead	= unix_cache_readahead,
1394 	.zeroout	= unix_zeroout,
1395 };
1396 
1397 io_manager unix_io_manager = &struct_unix_manager;
1398 
1399 static struct struct_io_manager struct_unixfd_manager = {
1400 	.magic		= EXT2_ET_MAGIC_IO_MANAGER,
1401 	.name		= "Unix fd I/O Manager",
1402 	.open		= unixfd_open,
1403 	.close		= unix_close,
1404 	.set_blksize	= unix_set_blksize,
1405 	.read_blk	= unix_read_blk,
1406 	.write_blk	= unix_write_blk,
1407 	.flush		= unix_flush,
1408 	.write_byte	= unix_write_byte,
1409 	.set_option	= unix_set_option,
1410 	.get_stats	= unix_get_stats,
1411 	.read_blk64	= unix_read_blk64,
1412 	.write_blk64	= unix_write_blk64,
1413 	.discard	= unix_discard,
1414 	.cache_readahead	= unix_cache_readahead,
1415 	.zeroout	= unix_zeroout,
1416 };
1417 
1418 io_manager unixfd_io_manager = &struct_unixfd_manager;
1419