diff --git a/cfg.mk b/cfg.mk
index 4df74f05d0..d0d68cc31c 100644
--- a/cfg.mk
+++ b/cfg.mk
@@ -80,7 +80,7 @@ sc_root_tests:
@if test -d tests \
&& grep check-root tests/Makefile.am>/dev/null 2>&1; then \
t1=sc-root.expected; t2=sc-root.actual; \
- grep -nl '^require_root_$$' \
+ grep -nl '^ *require_root_$$' \
$$($(VC_LIST) tests) |sed s,tests/,, |sort > $$t1; \
sed -n '/^root_tests =[ ]*\\$$/,/[^\]$$/p' \
$(srcdir)/tests/Makefile.am \
diff --git a/src/Makefile.am b/src/Makefile.am
index 91364635fc..bf1d60a886 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -145,6 +145,7 @@ noinst_HEADERS = \
copy.h \
cp-hash.h \
dircolors.h \
+ fiemap.h \
find-mount-point.h \
fs.h \
group-list.h \
@@ -449,7 +450,7 @@ uninstall-local:
fi; \
fi
-copy_sources = copy.c cp-hash.c
+copy_sources = copy.c cp-hash.c extent-scan.c extent-scan.h
# Use `ginstall' in the definition of PROGRAMS and in dependencies to avoid
# confusion with the `install' target. The install rule transforms `ginstall'
diff --git a/src/copy.c b/src/copy.c
index 9a014ad5aa..96bb35b135 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -36,6 +36,7 @@
#include "buffer-lcm.h"
#include "copy.h"
#include "cp-hash.h"
+#include "extent-scan.h"
#include "error.h"
#include "fcntl--.h"
#include "file-set.h"
@@ -62,6 +63,10 @@
# include "verror.h"
#endif
+#ifndef HAVE_FIEMAP
+# include "fiemap.h"
+#endif
+
#ifndef HAVE_FCHOWN
# define HAVE_FCHOWN false
# define fchown(fd, uid, gid) (-1)
@@ -129,6 +134,122 @@ utimens_symlink (char const *file, struct timespec const *timespec)
return err;
}
+/* Copy the regular file open on SRC_FD/SRC_NAME to DEST_FD/DST_NAME,
+ honoring the MAKE_HOLES setting and using the BUF_SIZE-byte buffer
+ BUF for temporary storage. Copy no more than MAX_N_READ bytes;
+ stop earlier if read reports end of file.
+ Return true upon successful completion;
+ print a diagnostic and return false upon error.
+ When MAKE_HOLES is set, each block read that consists entirely of
+ zero bytes is skipped with lseek on DEST_FD instead of being written.
+ Note that for best results, BUF should be "well"-aligned.
+ BUF must have sizeof(uintptr_t)-1 bytes of additional space
+ beyond BUF[BUF_SIZE-1].
+ NOTE(review): when a read fills the buffer, the sentinel is stored at
+ BUF[BUF_SIZE] and the word scan may read the bytes after it, so up to
+ sizeof (uintptr_t) extra bytes are touched -- confirm the caller's
+ slop covers that.
+ Set *LAST_WRITE_MADE_HOLE to true if the final operation on
+ DEST_FD introduced a hole, and to false otherwise. */
+static bool
+sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size,
+ bool make_holes,
+ char const *src_name, char const *dst_name,
+ uintmax_t max_n_read, bool *last_write_made_hole)
+{
+ typedef uintptr_t word;
+ *last_write_made_hole = false;
+
+ while (max_n_read)
+ {
+ /* WP stays NULL when this block must be written normally; it is
+ left non-NULL only when the block was all zeros and was turned
+ into a hole. */
+ word *wp = NULL;
+
+ ssize_t n_read = read (src_fd, buf, MIN (max_n_read, buf_size));
+ if (n_read < 0)
+ {
+#ifdef EINTR
+ /* An interrupted read transferred nothing; simply retry it. */
+ if (errno == EINTR)
+ continue;
+#endif
+ error (0, errno, _("reading %s"), quote (src_name));
+ return false;
+ }
+ if (n_read == 0)
+ break;
+ max_n_read -= n_read;
+
+ if (make_holes)
+ {
+ char *cp;
+
+ /* Sentinel to stop loop. */
+ buf[n_read] = '\1';
+#ifdef lint
+ /* Usually, buf[n_read] is not the byte just before a "word"
+ (aka uintptr_t) boundary. In that case, the word-oriented
+ test below (*wp++ == 0) would read some uninitialized bytes
+ after the sentinel. To avoid false-positive reports about
+ this condition (e.g., from a tool like valgrind), set the
+ remaining bytes -- to any value. */
+ memset (buf + n_read + 1, 0, sizeof (word) - 1);
+#endif
+
+ /* Find first nonzero *word*, or the word with the sentinel. */
+
+ wp = (word *) buf;
+ while (*wp++ == 0)
+ continue;
+
+ /* Find the first nonzero *byte*, or the sentinel. */
+
+ cp = (char *) (wp - 1);
+ while (*cp++ == 0)
+ continue;
+
+ /* CP now points one past the first nonzero byte. If that byte
+ lies within the N_READ bytes just read, the block holds data. */
+ if (cp <= buf + n_read)
+ /* Clear to indicate that a normal write is needed. */
+ wp = NULL;
+ else
+ {
+ /* We found the sentinel, so the whole input block was zero.
+ Make a hole. */
+ if (lseek (dest_fd, n_read, SEEK_CUR) < 0)
+ {
+ error (0, errno, _("cannot lseek %s"), quote (dst_name));
+ return false;
+ }
+ *last_write_made_hole = true;
+ }
+ }
+
+ if (!wp)
+ {
+ size_t n = n_read;
+ if (full_write (dest_fd, buf, n) != n)
+ {
+ error (0, errno, _("writing %s"), quote (dst_name));
+ return false;
+ }
+ *last_write_made_hole = false;
+
+ /* It is tempting to return early here upon a short read from a
+ regular file. That would save the final read syscall for each
+ file. Unfortunately that doesn't work for certain files in
+ /proc with linux kernels from at least 2.6.9 .. 2.6.29. */
+ }
+ }
+
+ return true;
+}
+
+/* Record the length of the output file open on DEST_FD/DST_NAME by
+   truncating it at the current file offset.  Call this when the file
+   ends with a `hole' (i.e., when sparse_copy set *LAST_WRITE_MADE_HOLE
+   to true): the trailing hole was produced by seeking only, so the
+   length still has to be set explicitly.
+   Return true on success.  Print a diagnostic and return false if
+   either the offset query or the truncation fails.  */
+static bool
+sparse_copy_finalize (int dest_fd, char const *dst_name)
+{
+  off_t len = lseek (dest_fd, 0, SEEK_CUR);
+  if (len < 0)
+    {
+      /* Without the offset the length cannot be recorded; reporting
+         success here would silently leave a too-short destination.  */
+      error (0, errno, _("cannot lseek %s"), quote (dst_name));
+      return false;
+    }
+
+  if (ftruncate (dest_fd, len) < 0)
+    {
+      error (0, errno, _("truncating %s"), quote (dst_name));
+      return false;
+    }
+
+  return true;
+}
+
/* Perform the O(1) btrfs clone operation, if possible.
Upon success, return 0. Otherwise, return -1 and set errno. */
static inline int
@@ -148,6 +269,154 @@ clone_file (int dest_fd, int src_fd)
#endif
}
+/* Write N_BYTES zero bytes to file descriptor FD.  Return true if successful.
+   Upon write failure, return false; errno is as left by full_write.
+   The zero-filled source buffer is allocated on first use and kept in
+   function-static storage for all later calls.  */
+static bool
+write_zeros (int fd, uint64_t n_bytes)
+{
+  static char *zeros;
+  static size_t nz = IO_BUFSIZE;
+
+  /* Attempt to use a relatively large calloc'd source buffer for
+     efficiency, but if that allocation fails, resort to a smaller
+     statically allocated one.  */
+  if (zeros == NULL)
+    {
+      static char fallback[1024];
+      zeros = calloc (nz, 1);
+      if (zeros == NULL)
+        {
+          zeros = fallback;
+          nz = sizeof fallback;
+        }
+    }
+
+  while (n_bytes)
+    {
+      /* Bound each write by the buffer length NZ.  (Using `sizeof nz'
+         here would limit every write to the size of a size_t.)  */
+      uint64_t n = MIN (nz, n_bytes);
+      if ((full_write (fd, zeros, n)) != n)
+        return false;
+      n_bytes -= n;
+    }
+
+  return true;
+}
+
+/* Perform an efficient extent copy, if possible.  This avoids
+   the overhead of detecting holes in hole-introducing/preserving
+   copy, and thus makes copying sparse files much more efficient.
+   SRC_FD/SRC_NAME is the source and DEST_FD/DST_NAME the destination.
+   BUF is a BUF_SIZE-byte scratch buffer that is handed to sparse_copy.
+   SRC_TOTAL_SIZE is the length the destination must end up with.
+   If MAKE_HOLES, the destination offset is moved with lseek to the
+   start of each extent; otherwise the gaps between extents are filled
+   with explicitly written zeros.
+   Upon a successful copy, return true.  If the initial extent scan
+   fails, set *REQUIRE_NORMAL_COPY to true and return false.
+   Upon any other failure, print a diagnostic, set *REQUIRE_NORMAL_COPY
+   to false and return false.  */
+static bool
+extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size,
+             off_t src_total_size, bool make_holes,
+             char const *src_name, char const *dst_name,
+             bool *require_normal_copy)
+{
+  struct extent_scan scan;
+  off_t last_ext_start = 0;
+  uint64_t last_ext_len = 0;
+
+  /* Give the caller a definite answer on every return path; only a
+     failed initial scan overrides this below.  */
+  *require_normal_copy = false;
+
+  extent_scan_init (src_fd, &scan);
+
+  bool wrote_hole_at_eof = true;
+  do
+    {
+      bool ok = extent_scan_read (&scan);
+      if (! ok)
+        {
+          /* No extent array was allocated by this failed call, so
+             there is nothing to free on any of these paths.  */
+          if (scan.hit_final_extent)
+            break;
+
+          if (scan.initial_scan_failed)
+            {
+              *require_normal_copy = true;
+              return false;
+            }
+
+          error (0, errno, _("%s: failed to get extents info"),
+                 quote (src_name));
+          return false;
+        }
+
+      unsigned int i;
+      for (i = 0; i < scan.ei_count; i++)
+        {
+          off_t ext_start = scan.ext_info[i].ext_logical;
+          uint64_t ext_len = scan.ext_info[i].ext_length;
+
+          if (lseek (src_fd, ext_start, SEEK_SET) < 0)
+            {
+              error (0, errno, _("cannot lseek %s"), quote (src_name));
+            fail:
+              /* Shared failure exit for everything inside this loop:
+                 release the current batch of extents first.  */
+              extent_scan_free (&scan);
+              return false;
+            }
+
+          if (make_holes)
+            {
+              if (lseek (dest_fd, ext_start, SEEK_SET) < 0)
+                {
+                  error (0, errno, _("cannot lseek %s"), quote (dst_name));
+                  goto fail;
+                }
+            }
+          else
+            {
+              /* When not inducing holes and when there is a hole between
+                 the end of the previous extent and the beginning of the
+                 current one, write zeros to the destination file.  */
+              if (last_ext_start + last_ext_len < ext_start)
+                {
+                  uint64_t hole_size = (ext_start
+                                        - last_ext_start
+                                        - last_ext_len);
+                  if (! write_zeros (dest_fd, hole_size))
+                    {
+                      error (0, errno, _("%s: write failed"), quote (dst_name));
+                      goto fail;
+                    }
+                }
+            }
+
+          last_ext_start = ext_start;
+          last_ext_len = ext_len;
+
+          /* sparse_copy prints its own diagnostic; the extent array
+             must still be released, hence `goto fail' rather than a
+             bare return.  */
+          if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size,
+                              make_holes, src_name, dst_name, ext_len,
+                              &wrote_hole_at_eof))
+            goto fail;
+        }
+
+      /* Release the space allocated to scan->ext_info.  */
+      extent_scan_free (&scan);
+
+    }
+  while (! scan.hit_final_extent);
+
+  /* When the source file ends with a hole, we have to do a little more work,
+     since the above copied only up to and including the final extent.
+     In order to complete the copy, we may have to insert a hole or write
+     zeros in the destination corresponding to the source file's hole-at-EOF.
+
+     In addition, if the final extent was a block of zeros at EOF and we've
+     just converted them to a hole in the destination, we must call ftruncate
+     here in order to record the proper length in the destination.  */
+  off_t dest_len = lseek (dest_fd, 0, SEEK_CUR);
+  if ((dest_len < src_total_size || wrote_hole_at_eof)
+      && (make_holes
+          ? ftruncate (dest_fd, src_total_size)
+          : ! write_zeros (dest_fd, src_total_size - dest_len)))
+    {
+      error (0, errno, _("failed to extend %s"), quote (dst_name));
+      return false;
+    }
+
+  return true;
+}
+
/* FIXME: describe */
/* FIXME: rewrite this to use a hash table so we avoid the quadratic
performance hit that's probably noticeable only on trees deeper
@@ -647,7 +916,6 @@ copy_reg (char const *src_name, char const *dst_name,
if (data_copy_required)
{
typedef uintptr_t word;
- off_t n_read_total = 0;
/* Choose a suitable buffer size; it may be adjusted later. */
size_t buf_alignment = lcm (getpagesize (), sizeof (word));
@@ -655,7 +923,6 @@ copy_reg (char const *src_name, char const *dst_name,
size_t buf_size = io_blksize (sb);
/* Deal with sparse files. */
- bool last_write_made_hole = false;
bool make_holes = false;
if (S_ISREG (sb.st_mode))
@@ -704,106 +971,35 @@ copy_reg (char const *src_name, char const *dst_name,
buf_alloc = xmalloc (buf_size + buf_alignment_slop);
buf = ptr_align (buf_alloc, buf_alignment);
- while (true)
+ bool normal_copy_required;
+ /* Perform an efficient extent-based copy, falling back to the
+ standard copy only if the initial extent scan fails. If the
+ '--sparse=never' option is specified, write all data but use
+ any extents to read more efficiently. */
+ if (extent_copy (source_desc, dest_desc, buf, buf_size,
+ src_open_sb.st_size, make_holes,
+ src_name, dst_name, &normal_copy_required))
+ goto preserve_metadata;
+
+ if (! normal_copy_required)
{
- word *wp = NULL;
-
- ssize_t n_read = read (source_desc, buf, buf_size);
- if (n_read < 0)
- {
-#ifdef EINTR
- if (errno == EINTR)
- continue;
-#endif
- error (0, errno, _("reading %s"), quote (src_name));
- return_val = false;
- goto close_src_and_dst_desc;
- }
- if (n_read == 0)
- break;
-
- n_read_total += n_read;
-
- if (make_holes)
- {
- char *cp;
-
- /* Sentinel to stop loop. */
- buf[n_read] = '\1';
-#ifdef lint
- /* Usually, buf[n_read] is not the byte just before a "word"
- (aka uintptr_t) boundary. In that case, the word-oriented
- test below (*wp++ == 0) would read some uninitialized bytes
- after the sentinel. To avoid false-positive reports about
- this condition (e.g., from a tool like valgrind), set the
- remaining bytes -- to any value. */
- memset (buf + n_read + 1, 0, sizeof (word) - 1);
-#endif
-
- /* Find first nonzero *word*, or the word with the sentinel. */
-
- wp = (word *) buf;
- while (*wp++ == 0)
- continue;
-
- /* Find the first nonzero *byte*, or the sentinel. */
-
- cp = (char *) (wp - 1);
- while (*cp++ == 0)
- continue;
-
- if (cp <= buf + n_read)
- /* Clear to indicate that a normal write is needed. */
- wp = NULL;
- else
- {
- /* We found the sentinel, so the whole input block was zero.
- Make a hole. */
- if (lseek (dest_desc, n_read, SEEK_CUR) < 0)
- {
- error (0, errno, _("cannot lseek %s"), quote (dst_name));
- return_val = false;
- goto close_src_and_dst_desc;
- }
- last_write_made_hole = true;
- }
- }
-
- if (!wp)
- {
- size_t n = n_read;
- if (full_write (dest_desc, buf, n) != n)
- {
- error (0, errno, _("writing %s"), quote (dst_name));
- return_val = false;
- goto close_src_and_dst_desc;
- }
- last_write_made_hole = false;
-
- /* It is tempting to return early here upon a short read from a
- regular file. That would save the final read syscall for each
- file. Unfortunately that doesn't work for certain files in
- /proc with linux kernels from at least 2.6.9 .. 2.6.29. */
- }
+ return_val = false;
+ goto close_src_and_dst_desc;
}
- /* If the file ends with a `hole', we need to do something to record
- the length of the file. On modern systems, calling ftruncate does
- the job. On systems without native ftruncate support, we have to
- write a byte at the ending position. Otherwise the kernel would
- truncate the file at the end of the last write operation. */
-
- if (last_write_made_hole)
+ bool wrote_hole_at_eof;
+ if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size,
+ make_holes, src_name, dst_name, UINTMAX_MAX,
+ &wrote_hole_at_eof)
+ || (wrote_hole_at_eof &&
+ ! sparse_copy_finalize (dest_desc, dst_name)))
{
- if (ftruncate (dest_desc, n_read_total) < 0)
- {
- error (0, errno, _("truncating %s"), quote (dst_name));
- return_val = false;
- goto close_src_and_dst_desc;
- }
+ return_val = false;
+ goto close_src_and_dst_desc;
}
}
+preserve_metadata:
if (x->preserve_timestamps)
{
struct timespec timespec[2];
diff --git a/src/extent-scan.c b/src/extent-scan.c
new file mode 100644
index 0000000000..3bb0d536ce
--- /dev/null
+++ b/src/extent-scan.c
@@ -0,0 +1,116 @@
+/* extent-scan.c -- core functions for scanning extents
+ Copyright (C) 2010 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Written by Jie Liu (jeff.liu@oracle.com). */
+
+#include <config.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+
+#include "system.h"
+#include "extent-scan.h"
+
+#ifndef HAVE_FIEMAP
+# include "fiemap.h"
+#endif
+
+/* Initialize *SCAN for scanning the extents of the file open on SRC_FD,
+   so that it can be passed to extent_scan_read.  The caller provides
+   the storage for *SCAN; nothing is allocated here.  The ext_info
+   pointer is set to NULL so that extent_scan_free is harmless even if
+   no scan has succeeded yet.  */
+extern void
+extent_scan_init (int src_fd, struct extent_scan *scan)
+{
+  scan->fd = src_fd;
+  scan->ei_count = 0;
+  scan->scan_start = 0;
+  scan->initial_scan_failed = false;
+  scan->hit_final_extent = false;
+  scan->ext_info = NULL;
+}
+
+#ifdef __linux__
+# ifndef FS_IOC_FIEMAP
+# define FS_IOC_FIEMAP _IOWR ('f', 11, struct fiemap)
+# endif
+/* Call ioctl(2) with FS_IOC_FIEMAP (available in linux 2.6.27) to
+ obtain a map of file extents excluding holes, starting at
+ SCAN->scan_start.
+ Return true when at least one extent was obtained: SCAN->ext_info is
+ then a newly allocated array of SCAN->ei_count entries (release it
+ with extent_scan_free), and SCAN->scan_start is advanced past the
+ last extent unless that extent is flagged as the file's last, in
+ which case SCAN->hit_final_extent is set instead.
+ Return false when the ioctl fails (SCAN->initial_scan_failed is set
+ if this was the scan starting at offset 0) or when the ioctl reports
+ no extents (SCAN->hit_final_extent is set). */
+extern bool
+extent_scan_read (struct extent_scan *scan)
+{
+ /* A fixed 4096-byte request buffer: the fiemap header followed by as
+ many fiemap_extent slots (COUNT) as fit in the remaining space. */
+ union { struct fiemap f; char c[4096]; } fiemap_buf;
+ struct fiemap *fiemap = &fiemap_buf.f;
+ struct fiemap_extent *fm_extents = &fiemap->fm_extents[0];
+ enum { count = (sizeof fiemap_buf - sizeof *fiemap) / sizeof *fm_extents };
+ verify (count != 0);
+
+ /* This is required at least to initialize fiemap->fm_start,
+ but also serves (in mid 2010) to appease valgrind, which
+ appears not to know the semantics of the FIEMAP ioctl. */
+ memset (&fiemap_buf, 0, sizeof fiemap_buf);
+
+ fiemap->fm_start = scan->scan_start;
+ fiemap->fm_flags = FIEMAP_FLAG_SYNC;
+ fiemap->fm_extent_count = count;
+ fiemap->fm_length = FIEMAP_MAX_OFFSET - scan->scan_start;
+
+ /* If the very first ioctl(2) call fails, record that so the caller
+ can fall back to the standard copy. */
+ if (ioctl (scan->fd, FS_IOC_FIEMAP, fiemap) < 0)
+ {
+ if (scan->scan_start == 0)
+ scan->initial_scan_failed = true;
+ return false;
+ }
+
+ /* If 0 extents are returned, then no further scans are needed. */
+ if (fiemap->fm_mapped_extents == 0)
+ {
+ scan->hit_final_extent = true;
+ return false;
+ }
+
+ scan->ei_count = fiemap->fm_mapped_extents;
+ scan->ext_info = xnmalloc (scan->ei_count, sizeof (struct extent_info));
+
+ unsigned int i;
+ for (i = 0; i < scan->ei_count; i++)
+ {
+ /* ext_logical is an off_t, so the 64-bit kernel offset must fit. */
+ assert (fm_extents[i].fe_logical <= OFF_T_MAX);
+
+ scan->ext_info[i].ext_logical = fm_extents[i].fe_logical;
+ scan->ext_info[i].ext_length = fm_extents[i].fe_length;
+ scan->ext_info[i].ext_flags = fm_extents[i].fe_flags;
+ }
+
+ /* Step back to the index of the last extent returned; ei_count is
+ nonzero here, so this cannot underflow. */
+ i--;
+ if (scan->ext_info[i].ext_flags & FIEMAP_EXTENT_LAST)
+ {
+ scan->hit_final_extent = true;
+ return true;
+ }
+
+ /* More extents may follow: resume just past the end of this batch. */
+ scan->scan_start = fm_extents[i].fe_logical + fm_extents[i].fe_length;
+
+ return true;
+}
+#else
+/* Extent scanning is not implemented for this platform.  Always fail
+   with errno set to ENOTSUP, and flag the failure in
+   SCAN->initial_scan_failed: that flag is what tells a caller to fall
+   back to a normal copy instead of treating the failure as a hard
+   error.  */
+extern bool
+extent_scan_read (struct extent_scan *scan)
+{
+  scan->initial_scan_failed = true;
+  errno = ENOTSUP;
+  return false;
+}
+#endif
diff --git a/src/extent-scan.h b/src/extent-scan.h
new file mode 100644
index 0000000000..ac9e5006fc
--- /dev/null
+++ b/src/extent-scan.h
@@ -0,0 +1,68 @@
+/* core functions for efficient reading sparse files
+ Copyright (C) 2010 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Written by Jie Liu (jeff.liu@oracle.com). */
+
+#ifndef EXTENT_SCAN_H
+# define EXTENT_SCAN_H
+
+/* Description of a single extent of a file, as stored by
+ extent_scan_read. */
+struct extent_info
+{
+ /* Logical offset of the extent within the file. */
+ off_t ext_logical;
+
+ /* Length of the extent. */
+ uint64_t ext_length;
+
+ /* Extent flags. Meaningful for FIEMAP only (FIEMAP_EXTENT_* bits);
+ otherwise set to zero. */
+ uint32_t ext_flags;
+};
+
+/* State of an extent scan over one file. Set it up with
+ extent_scan_init, then call extent_scan_read repeatedly. */
+struct extent_scan
+{
+ /* File descriptor of the file being scanned. */
+ int fd;
+
+ /* Offset at which the next scan starts. */
+ off_t scan_start;
+
+ /* Number of extent_info entries returned by the latest scan. */
+ uint32_t ei_count;
+
+ /* If true, the caller should fall back to a normal copy. Set when
+ the initial scan fails: the ioctl(2) for FIEMAP, or lseek(2) with
+ SEEK_DATA. */
+ bool initial_scan_failed;
+
+ /* If true, the scan of the whole file has finished. */
+ bool hit_final_extent;
+
+ /* Extent information: a malloc'd array of ei_count structs. */
+ struct extent_info *ext_info;
+};
+
+void extent_scan_init (int src_fd, struct extent_scan *scan);
+
+bool extent_scan_read (struct extent_scan *scan);
+
+/* Release the extent array held in *SCAN and reset the fields that
+   describe it, so that no dangling pointer or stale count remains and
+   a repeated call is harmless.  The other fields of *SCAN are left
+   untouched.  */
+static inline void
+extent_scan_free (struct extent_scan *scan)
+{
+  free (scan->ext_info);
+  scan->ext_info = NULL;
+  scan->ei_count = 0;
+}
+
+#endif /* EXTENT_SCAN_H */
diff --git a/src/fiemap.h b/src/fiemap.h
new file mode 100644
index 0000000000..c5d8424b3b
--- /dev/null
+++ b/src/fiemap.h
@@ -0,0 +1,102 @@
+/* FS_IOC_FIEMAP ioctl infrastructure.
+ Some portions copyright (C) 2007 Cluster File Systems, Inc
+ Authors: Mark Fasheh <mfasheh@suse.com>
+ Kalpak Shah <kalpak.shah@sun.com>
+ Andreas Dilger <adilger@sun.com>. */
+
+/* Copy from kernel, modified to respect GNU code style by Jie Liu. */
+
+#ifndef _LINUX_FIEMAP_H
+# define _LINUX_FIEMAP_H
+
+# include <stdint.h>
+
+/* One extent as reported by the FS_IOC_FIEMAP ioctl; an array of these
+ follows the struct fiemap request header. */
+struct fiemap_extent
+{
+ /* Logical offset in bytes for the start of the extent
+ from the beginning of the file. */
+ uint64_t fe_logical;
+
+ /* Physical offset in bytes for the start of the extent
+ from the beginning of the disk. */
+ uint64_t fe_physical;
+
+ /* Length in bytes for this extent. */
+ uint64_t fe_length;
+
+ /* Reserved. */
+ uint64_t fe_reserved64[2];
+
+ /* FIEMAP_EXTENT_* flags for this extent. */
+ uint32_t fe_flags;
+
+ /* Reserved. */
+ uint32_t fe_reserved[3];
+};
+
+/* Request/reply header for the FS_IOC_FIEMAP ioctl. Each field is
+ marked (in), (out) or (in/out) according to its direction. */
+struct fiemap
+{
+ /* Logical offset (inclusive) at which to start mapping (in). */
+ uint64_t fm_start;
+
+ /* Logical length of mapping which userspace wants (in). */
+ uint64_t fm_length;
+
+ /* FIEMAP_FLAG_* flags for request (in/out). */
+ uint32_t fm_flags;
+
+ /* Number of extents that were mapped (out). */
+ uint32_t fm_mapped_extents;
+
+ /* Size of fm_extents array (in). */
+ uint32_t fm_extent_count;
+
+ /* Reserved. */
+ uint32_t fm_reserved;
+
+ /* Array of mapped extents (out). Declared with zero length; the
+ caller supplies the storage that follows this header. */
+ struct fiemap_extent fm_extents[0];
+};
+
+/* The maximum offset can be mapped for a file. */
+# define FIEMAP_MAX_OFFSET (~0ULL)
+
+/* Sync file data before map. */
+# define FIEMAP_FLAG_SYNC 0x00000001
+
+/* Map extended attribute tree. */
+# define FIEMAP_FLAG_XATTR 0x00000002
+
+# define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
+
+/* Last extent in file. */
+# define FIEMAP_EXTENT_LAST 0x00000001
+
+/* Data location unknown. */
+# define FIEMAP_EXTENT_UNKNOWN 0x00000002
+
+/* Location still pending, Sets EXTENT_UNKNOWN. */
+# define FIEMAP_EXTENT_DELALLOC 0x00000004
+
+/* Data can not be read while fs is unmounted. */
+# define FIEMAP_EXTENT_ENCODED 0x00000008
+
+/* Data is encrypted by fs. Sets EXTENT_NO_BYPASS. */
+# define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080
+
+/* Extent offsets may not be block aligned. */
+# define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100
+
+/* Data mixed with metadata. Sets EXTENT_NOT_ALIGNED. */
+# define FIEMAP_EXTENT_DATA_INLINE 0x00000200
+
+/* Multiple files in block. Set EXTENT_NOT_ALIGNED. */
+# define FIEMAP_EXTENT_DATA_TAIL 0x00000400
+
+/* Space allocated, but not data (i.e. zero). */
+# define FIEMAP_EXTENT_UNWRITTEN 0x00000800
+
+/* File does not natively support extents. Result merged for efficiency. */
+# define FIEMAP_EXTENT_MERGED 0x00001000
+
+/* Space shared with other files. */
+# define FIEMAP_EXTENT_SHARED 0x00002000
+
+#endif
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 1e4e3009f7..40d35ac7d8 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -10,6 +10,7 @@ EXTRA_DIST = \
CuTmpdir.pm \
check.mk \
envvar-check \
+ filefrag-extent-compare \
init.cfg \
init.sh \
lang-default \
@@ -25,6 +26,7 @@ root_tests = \
cp/special-bits \
cp/cp-mv-enotsup-xattr \
cp/capability \
+ cp/sparse-fiemap \
dd/skip-seek-past-dev \
install/install-C-root \
ls/capability \
@@ -318,6 +320,8 @@ TESTS = \
cp/dir-vs-file \
cp/existing-perm-race \
cp/fail-perm \
+ cp/fiemap-perf \
+ cp/fiemap-2 \
cp/file-perm-race \
cp/into-self \
cp/link \
diff --git a/tests/cp/fiemap-2 b/tests/cp/fiemap-2
new file mode 100755
index 0000000000..d40505b704
--- /dev/null
+++ b/tests/cp/fiemap-2
@@ -0,0 +1,54 @@
+#!/bin/sh
+# Exercise a few more corners of the fiemap-copying code.
+
+# Copyright (C) 2011 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+print_ver_ cp
+
+# Require a fiemap-enabled FS.
+df -T -t btrfs -t xfs -t ext4 -t ocfs2 . \
+ || skip_ "this file system lacks FIEMAP support"
+
+# Exercise the code that handles a file ending in a hole.
+# K is one byte of data, then extended (without writing) to 128KiB.
+printf x > k || framework_failure_
+dd bs=1k seek=128 of=k < /dev/null || framework_failure_
+
+# The first time through the outer loop, the input file, K, ends with a hole.
+# The second time through, we append a byte so that it does not.
+# Each variant is copied both with and without hole creation, and the
+# copy must be byte-for-byte identical to the original.
+for append in no yes; do
+ test $append = yes && printf y >> k
+ for i in always never; do
+ cp --sparse=$i k k2 || fail=1
+ cmp k k2 || fail=1
+ done
+done
+
+# Ensure that --sparse=always can restore holes.
+rm -f k
+# Create a file starting with an "x", followed by 256K-1 0 bytes:
+# 255 1KiB blocks of zeros written at offset 1KiB give a 256KiB file.
+printf x > k || framework_failure_
+dd bs=1k seek=1 of=k count=255 < /dev/zero || framework_failure_
+
+# cp should detect the all-zero blocks and convert some of them to holes.
+# How many it detects/converts currently depends on io_blksize.
+# Currently, on my F14/ext4 desktop, this K starts off with size 256KiB,
+# (note that the K in the preceding test starts off with size 4KiB).
+# cp from coreutils-8.9 with --sparse=always reduces the size to 32KiB.
+cp --sparse=always k k2 || fail=1
+test $(stat -c %b k2) -lt $(stat -c %b k) || fail=1
+
+Exit $fail
diff --git a/tests/cp/fiemap-perf b/tests/cp/fiemap-perf
new file mode 100755
index 0000000000..429e59beb1
--- /dev/null
+++ b/tests/cp/fiemap-perf
@@ -0,0 +1,32 @@
+#!/bin/sh
+# ensure that a sparse file is copied efficiently, by default
+
+# Copyright (C) 2011 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+print_ver_ cp
+
+# Require a fiemap-enabled FS.
+df -T -t btrfs -t xfs -t ext4 -t ocfs2 . \
+  || skip_ "this file system lacks FIEMAP support"
+
+# Create a large-but-sparse file.  Failing to do so is a problem with
+# the test environment, not with cp.
+timeout 1 dd bs=1 seek=1T of=f < /dev/null || framework_failure_
+
+# Nothing can read (much less write) that many bytes in so little time.
+# A cp that fails or times out here is exactly the defect this test
+# exists to catch, so record it as a test failure rather than as a
+# framework failure.
+timeout 3 cp f f2 || fail=1
+
+Exit $fail
diff --git a/tests/cp/sparse-fiemap b/tests/cp/sparse-fiemap
new file mode 100755
index 0000000000..b6b1103909
--- /dev/null
+++ b/tests/cp/sparse-fiemap
@@ -0,0 +1,119 @@
+#!/bin/sh
+# Test cp --sparse=always through fiemap copy
+
+# Copyright (C) 2010 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Source the test framework and put the just-built tools first in PATH
+# before printing the version, so that the cp reported is the cp tested.
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+print_ver_ cp
+
+if df -T -t btrfs -t xfs -t ext4 -t ocfs2 . ; then
+  : # Current dir is on a partition with working extents.  Good!
+else
+  # It's not; we need to create one, hence we need root access.
+  require_root_
+
+  cwd=$PWD
+  cleanup_() { cd /; umount "$cwd/mnt"; }
+
+  skip=0
+  # Create an ext4 loopback file system
+  dd if=/dev/zero of=blob bs=32k count=1000 || skip=1
+  mkdir mnt
+  mkfs -t ext4 -F blob ||
+    skip_ "failed to create ext4 file system"
+  mount -oloop blob mnt   || skip=1
+  cd mnt                  || skip=1
+  echo test > f           || skip=1
+  test -s f               || skip=1
+
+  test $skip = 1 &&
+    skip_ "insufficient mount/ext4 support"
+fi
+
+# Create a 1TiB sparse file
+dd if=/dev/zero of=sparse bs=1k count=1 seek=1G || framework_failure_
+
+# It takes many minutes to copy this sparse file using the old method.
+# By contrast, it takes far less than 1 second using FIEMAP-copy.
+timeout 10 cp --sparse=always sparse fiemap || fail=1
+
+# Ensure that the sparse file copied through fiemap has the same size
+# in bytes as the original.
+test $(stat --printf %s sparse) = $(stat --printf %s fiemap) || fail=1
+
+# =================================================
+# Ensure that we exercise the FIEMAP-copying code enough
+# to provoke at least two iterations of the do...while loop
+# in which it calls ioctl (fd, FS_IOC_FIEMAP,...
+# This also verifies that non-trivial extents are preserved.
+
+$PERL -e 1 || skip_ 'skipping part of this test; you lack perl'
+
+# Extract logical block number and length pairs from filefrag -v output.
+# The initial sed is to remove the "eof" from the normally-empty "flags" field.
+# Similarly, remove flags values like "unknown,delalloc,eof".
+# That is required when that final extent has no number in the "expected" field.
+# The file name arguments are passed to sed quoted, one word per name.
+f()
+{
+  sed 's/ [a-z,][a-z,]*$//' "$@" \
+    | awk '/^ *[0-9]/ {printf "%d %d ", $2 ,NF < 5 ? $NF : $5 } END {print ""}'
+}
+
+for i in $(seq 1 2 21); do
+  for j in 1 2 31 100; do
+    $PERL -e 'BEGIN { $n = '$i' * 1024; *F = *STDOUT }' \
+          -e 'for (1..'$j') { sysseek (*F, $n, 1)' \
+          -e '&& syswrite (*F, chr($_)x$n) or die "$!"}' > j1 || fail=1
+    # sync
+    cp --sparse=always j1 j2 || fail=1
+    # sync
+    # Technically we may need the 'sync' uses above, but
+    # uncommenting them makes this test take much longer.
+
+    cmp j1 j2 || fail=1
+    filefrag -v j1 | grep extent \
+      || skip_ 'skipping part of this test; you lack filefrag'
+
+    # Here is sample filefrag output:
+    # $ perl -e 'BEGIN{$n=16*1024; *F=*STDOUT}' \
+    #        -e 'for (1..5) { sysseek(*F,$n,1)' \
+    #        -e '&& syswrite *F,"."x$n or die "$!"}' > j
+    # $ filefrag -v j
+    # File system type is: ef53
+    # File size of j is 163840 (40 blocks, blocksize 4096)
+    #  ext logical physical expected length flags
+    #    0       4  6258884               4
+    #    1      12  6258892  6258887      4
+    #    2      20  6258900  6258895      4
+    #    3      28  6258908  6258903      4
+    #    4      36  6258916  6258911      4 eof
+    # j: 6 extents found
+
+    # exclude the physical block numbers; they always differ
+    filefrag -v j1 > ff1 || fail=1
+    filefrag -v j2 > ff2 || fail=1
+    { f ff1; f ff2; } \
+      | $PERL $abs_top_srcdir/tests/filefrag-extent-compare \
+        || { fail=1; break; }
+  done
+  test $fail = 1 && break
+done
+
+Exit $fail
diff --git a/tests/filefrag-extent-compare b/tests/filefrag-extent-compare
new file mode 100644
index 0000000000..3c095d52f4
--- /dev/null
+++ b/tests/filefrag-extent-compare
@@ -0,0 +1,68 @@
+eval '(exit $?0)' && eval 'exec perl -wS "$0" ${1+"$@"}'
+ & eval 'exec perl -wS "$0" $argv:q'
+ if 0;
+# Determine whether two files have the same extents by comparing
+# the logical block numbers and lengths from filefrag -v for each.
+
+# Invoke like this:
+# This helper function, f, extracts logical block number and lengths.
+# f() { awk '/^ *[0-9]/ {printf "%d %d ",$2,NF<5?$NF:$5} END {print ""}'; }
+# { filefrag -v j1 | f; filefrag -v j2 | f; } | ./filefrag-extent-compare
+
+use warnings;
+use strict;
+(my $ME = $0) =~ s|.*/||;
+
+# Input: exactly two lines, one per file, each a whitespace-separated
+# list of "logical-block length" pairs.
+my @line = <>;
+my $n_lines = @line;
+$n_lines == 2
+  or die "$ME: expected exactly two input lines; got $n_lines\n";
+
+my @A = split ' ', $line[0];
+my @B = split ' ', $line[1];
+@A % 2 || @B % 2
+  and die "$ME: unexpected input: odd number of numbers; expected even\n";
+
+# Turn each flat list of numbers into a list of extent records.
+my @a;
+my @b;
+foreach my $i (0..@A/2-1) { $a[$i] = { L_BLK => $A[2*$i], LEN => $A[2*$i+1] } };
+foreach my $i (0..@B/2-1) { $b[$i] = { L_BLK => $B[2*$i], LEN => $B[2*$i+1] } };
+
+# Walk both lists in parallel: $i indexes @a and $j indexes @b.
+# Exit successfully once both are exhausted together; die at the first
+# pair of extents that cannot be reconciled.
+my $i = 0;
+my $j = 0;
+while (1)
+  {
+    !defined $a[$i] && !defined $b[$j]
+      and exit 0;
+    defined $a[$i] && defined $b[$j]
+      or die "\@a and \@b have different lengths, even after adjustment\n";
+    # Identical extents: advance both indices (via the continue block).
+    ($a[$i]->{L_BLK} == $b[$j]->{L_BLK}
+     && $a[$i]->{LEN} == $b[$j]->{LEN})
+      and next;
+    # Two consecutive extents of @a together match one extent of @b:
+    # skip the extra @a entry.
+    ($a[$i]->{LEN} < $b[$j]->{LEN}
+     && exists $a[$i+1] && $a[$i]->{LEN} + $a[$i+1]->{LEN} == $b[$j]->{LEN})
+      and ++$i, next;
+    # The mirror case: two consecutive extents of @b match one of @a.
+    # @b must be indexed by its own position $j, not by $i.
+    exists $b[$j+1] && $a[$i]->{LEN} == $b[$j]->{LEN} + $b[$j+1]->{LEN}
+      and ++$j, next;
+    die "differing extent:\n"
+      . "  [$i]=$a[$i]->{L_BLK} $a[$i]->{LEN}\n"
+      . "  [$j]=$b[$j]->{L_BLK} $b[$j]->{LEN}\n"
+  }
+continue
+  {
+    ++$i;
+    ++$j;
+  }
+
+### Setup "GNU" style for perl-mode and cperl-mode.
+## Local Variables:
+## mode: perl
+## perl-indent-level: 2
+## perl-continued-statement-offset: 2
+## perl-continued-brace-offset: 0
+## perl-brace-offset: 0
+## perl-brace-imaginary-offset: 0
+## perl-label-offset: -2
+## perl-extra-newline-before-brace: t
+## perl-merge-trailing-else: nil
+## End: