#!/usr/bin/perl -w use strict; # $Id: gzip-links,v 1.4 2005-03-25 10:00:39-04 roderick Exp $ use Proc::SafePipe qw(backtick_noshell popen_noshell); use Proc::WaitStat qw(waitstat); use RS::Handy qw(:stat $Me data_dump getopt xdie); my @Gzip_arg_0 = ( map({ "-$_" } 1..9), qw( -N --name -n --no-name -q --quiet -v --verbose --fast --best --rsyncable ), ); # first is preferred my @Gzip_suffix = qw(.gz .z .Z .taz .tgz -gz -z _z); my $Allow_no_files = 0; my $Debug = 0; my $Exit = 0; my $Gzip_suffix_pat = undef; my $Gzip_suffix = $Gzip_suffix[0]; my $No = 0; my @Real_gzip_arg = (); my $Quiet = 0; my $Stdin0 = 0; my $Verbose = 0; my $Version = q$Revision: 1.4 $ =~ /(\d\S+)/ ? $1 : '?'; my @Option_spec = ( 'allow-no-files' => \$Allow_no_files, 'debug+' => \$Debug, 'help' => sub { usage() }, 'no' => \$No, 'stdin0' => \$Stdin0, 'version' => sub { print "$Me version $Version\n"; exit }, map({ my $full = $_; # need lexical for closure (my $plain = $full) =~ s/^-+// or die; $plain => sub { push @Real_gzip_arg, $full } } @Gzip_arg_0), ); my $Usage = < }; # strip 1 trailing null, not all, so an empty name will be an # error later $s =~ s/\0\z// or $s eq '' or xdie "--stdin0 input didn't end with null\n"; push @ARGV, split /\0/, $s, -1; } } # Group paths by device/inode. Strip out those which can't be statted. # Return a reference to %inode: # # $inode{$dev}{$inode} = [$nlinks, $path...] sub group_by_inode { my @path = @_; my (%seen, %inode); for my $p (@path) { if ($seen{$p}++) { xwarn "$p specified multiple times\n"; next; } my @st = stat $p; if (!@st) { xwarn "can't stat $p:"; next; } my $r = $inode{$st[ST_DEV]}{$st[ST_INO]} ||= [$st[ST_NLINK]]; push @$r, $p; } print data_dump "inode:", \%inode if $Debug > 1; return \%inode; } # Given \%inode, do preprocessing. # # For multiply-linked files, remove all but the first name, saving the # other names for later re-linking. # # Return references to: # $single[$i] = $singly_linked_path # $mult{$path_1} = [$path_2...] sub preprocess { my ($rall) = @_; my (@single, %mult); for my $dev (keys %$rall) { Inode: for my $inode (keys %{ $rall->{$dev} }) { my ($nlink, @p) = @{ $rall->{$dev}{$inode} }; debug "preprocess $dev/$inode nlink=$nlink names=@p"; if ($nlink != @p) { xwarn "skipping multiply-linked inode, got ", 0+@p, " of $nlink links with: @p\n"; next Inode; } # Skip inodes with compressed-looking names, as gzip would # do the same, and I don't want to remove links 2+ in that # case (especially if it's those names which are bad). if (grep { /$Gzip_suffix_pat\z/ } @p) { for (@p) { xwarn "skipping $_, ", /($Gzip_suffix_pat)\z/ ? "already has $1 suffix\n" : "other name for this inode has a" ." compressed suffix\n"; } next Inode; } # Similarly if the .gz version exists. { my @exists = grep { lstat } map { "$_$Gzip_suffix" } @p; if (@exists) { xwarn "skipping @p due to existing @exists\n"; next Inode; } } if (@p > 1) { my ($keep, @rm) = @p; @p = ($keep); verbose "compressing $keep for @rm"; for (@rm) { if (!$No && !unlink $_) { xwarn "error unlinking $_:"; # put it in the %mult list anyway, I can create the # $_.gz link. } } $mult{$keep} = \@rm; } push @single, $p[0]; } } return \@single, \%mult; } # Compress the listed files, which are all singly-linked. sub compress { my ($rsingle) = @_; print data_dump "rsingle:", $rsingle if $Debug > 1; my @cmd = (qw(xargs -0 --no-run-if-empty gzip), @Real_gzip_arg); debug "running @cmd"; my $fh = eval { popen_noshell 'w', @cmd }; if (!$fh) { chomp $@; xwarn "error running xargs/gzip: $@\n"; return; } for (@$rsingle) { debug "to-gzip $_"; print $fh "$_\0" or xwarn "error writing to xargs/gzip:"; } if (!close $fh) { xwarn "error closing xargs/gzip: ", ($!+0 ? "$!" : 'non-zero exit (' . waitstat($?) . ')'), "\n"; } } # Put links back that I removed, but with a .gz attached if appropriate. sub postprocess { my ($rmult) = @_; print data_dump "rmult:", $rmult if $Debug > 1; my $suff = $Gzip_suffix; for my $base (keys %$rmult) { my @sub = @{ $rmult->{$base} }; my $have_plain = -f $base; my $have_gz = -f "$base$suff"; if ($have_gz) { if ($have_plain) { xwarn "both $base and $base$suff now exist,", " using $base$suff\n"; $have_plain = 0; } } elsif ($have_plain) { xwarn "$base didn't get compressed\n"; } else { xwarn "neither $base nor $base$suff now exists,", " skipping link restore of @sub\n"; next; } my $this_suff = $have_gz ? $suff : ''; my $base_path = "$base$this_suff"; verbose "relinking $base_path to ", join ' ', map { "$_$this_suff" } @sub; for my $sub (@sub) { my $sub_path = "$sub$this_suff"; if (!link $base_path, $sub_path) { xwarn "error linking $base_path to $sub_path:"; } } } } sub work { my @path = @_; if ($Debug) { debug "input $_" for @path; } if (!@path) { return if $Allow_no_files; usage "no files specified\n"; } my $rinode = group_by_inode @path; %$rinode or return; my ($rsingle, $rmult) = preprocess $rinode; @$rsingle or return; # After this point warn but continue after errors as possible so I # put the links back for the multiply-linked inodes which were # removed. return if $No; compress $rsingle; postprocess $rmult; } sub main { init; work @ARGV; return 0; } $Exit = main || $Exit; $Exit = 1 if $Exit && !($Exit % 256); exit $Exit; __END__ =head1 NAME gzip-links - run gzip but deal with files which have multiple (hard) links =head1 SYNOPSIS B [B<--allow-no-files>] [B<--debug>] [B<--help>] [B<--no>] [B<--stdin0>] [B<--version>] [B<-1>] [B<-2>] [B<-3>] [B<-4>] [B<-5>] [B<-6>] [B<-7>] [B<-8>] [B<-9>] [B<-N>] [B<--name>] [B<-n>] [B<--no-name>] [B<-q>] [B<--quiet>] [B<-v>] [B<--verbose>] [B<--fast>] [B<--best>] [B<--rsyncable>] [I]... file... =head1 DESCRIPTION B is like gzip but it allows you to compress multiply-linked files. It does this by removing all but one of the links, compressing the data under the remaining name, then linking to the removed names (plus .gz). =head1 OPTIONS =over 4 =item B<--allow-no-files> Don't complain if no files are given. This is particularly useful with B<--stdin0>, such as when feeding in B output when you don't know if there will be any matches. =item B<--debug> Turn debugging on. Use multiple times for more detail. =item B<--help> Show the usage message and die. =item B<--no> Go through the motions as possible, but don't change anything. =item B<--stdin0> Read null-terminated file names (such as from C) from stdin and treat them like the files specified as arguments. This is necessary because all of the names for a file have to be seen by the same B invocation, and with C this can't be guaranteed. =item B<--version> Show the version number and exit. =back =head1 OPTIONS FOR GZIP These switches are passed along to B. =over 4 =item B<-1> .. B<-9> =item B<-N>, B<--name> B<--name> doesn't make much sense in the multiply-linked case, since only 1 name (the first given on the command line) is saved. =item B<-n>, B<--no-name> =item B<-q>, B<--quiet> B<--quiet> affect B as well. =item B<-v>, B<--verbose> B<--verbose> affect B as well. =item B<--fast> =item B<--best> =item B<--rsyncable> =back =head1 EXAMPLES $ ls -liG total 3 3499474 -rw-rw-r-- 2 roderick 623 Feb 17 14:48 mult.1 3499474 -rw-rw-r-- 2 roderick 623 Feb 17 14:48 mult.2 3499476 -rw-rw-r-- 1 roderick 316 Feb 17 14:49 single $ gzip-links -v * gzip-links: compressing mult.1 for mult.2 mult.1: 40.2% -- replaced with mult.1.gz single: 36.3% -- replaced with single.gz gzip-links: relinking mult.1.gz to mult.2.gz $ ls -liG total 2 3499477 -rw-rw-r-- 2 roderick 397 Feb 17 14:48 mult.1.gz 3499477 -rw-rw-r-- 2 roderick 397 Feb 17 14:48 mult.2.gz 3499474 -rw-rw-r-- 1 roderick 226 Feb 17 14:49 single.gz $ _ =head1 AVAILABILITY The code is licensed under the GNU GPL. Check http://www.argon.org/~roderick/ for updated versions. =head1 AUTHOR Roderick Schertler =cut