head 1.4; access; symbols; locks; strict; comment @# @; 1.4 date 2005.03.25.14.00.39; author roderick; state Exp; branches; next 1.3; 1.3 date 2005.03.23.14.59.51; author roderick; state Exp; branches; next 1.2; 1.2 date 2005.02.17.19.56.38; author roderick; state Exp; branches; next 1.1; 1.1 date 2005.02.17.19.53.43; author roderick; state Exp; branches; next ; desc @@ 1.4 log @Add --allow-no-files. Allow --stdin0's input to be empty. @ text @#!/usr/bin/perl -w use strict; # $Id: gzip-links,v 1.3 2005-03-23 09:59:51-05 roderick Exp roderick $ use Proc::SafePipe qw(backtick_noshell popen_noshell); use Proc::WaitStat qw(waitstat); use RS::Handy qw(:stat $Me data_dump getopt xdie); my @@Gzip_arg_0 = ( map({ "-$_" } 1..9), qw( -N --name -n --no-name -q --quiet -v --verbose --fast --best --rsyncable ), ); # first is preferred my @@Gzip_suffix = qw(.gz .z .Z .taz .tgz -gz -z _z); my $Allow_no_files = 0; my $Debug = 0; my $Exit = 0; my $Gzip_suffix_pat = undef; my $Gzip_suffix = $Gzip_suffix[0]; my $No = 0; my @@Real_gzip_arg = (); my $Quiet = 0; my $Stdin0 = 0; my $Verbose = 0; my $Version = q$Revision: 1.3 $ =~ /(\d\S+)/ ? $1 : '?'; my @@Option_spec = ( 'allow-no-files' => \$Allow_no_files, 'debug+' => \$Debug, 'help' => sub { usage() }, 'no' => \$No, 'stdin0' => \$Stdin0, 'version' => sub { print "$Me version $Version\n"; exit }, map({ my $full = $_; # need lexical for closure (my $plain = $full) =~ s/^-+// or die; $plain => sub { push @@Real_gzip_arg, $full } } @@Gzip_arg_0), ); my $Usage = < }; # strip 1 trailing null, not all, so an empty name will be an # error later $s =~ s/\0\z// or $s eq '' or xdie "--stdin0 input didn't end with null\n"; push @@ARGV, split /\0/, $s, -1; } } # Group paths by device/inode. Strip out those which can't be statted. # Return a reference to %inode: # # $inode{$dev}{$inode} = [$nlinks, $path...] sub group_by_inode { my @@path = @@_; my (%seen, %inode); for my $p (@@path) { if ($seen{$p}++) { xwarn "$p specified multiple times\n"; next; } my @@st = stat $p; if (!@@st) { xwarn "can't stat $p:"; next; } my $r = $inode{$st[ST_DEV]}{$st[ST_INO]} ||= [$st[ST_NLINK]]; push @@$r, $p; } print data_dump "inode:", \%inode if $Debug > 1; return \%inode; } # Given \%inode, do preprocessing. # # For multiply-linked files, remove all but the first name, saving the # other names for later re-linking. # # Return references to: # $single[$i] = $singly_linked_path # $mult{$path_1} = [$path_2...] sub preprocess { my ($rall) = @@_; my (@@single, %mult); for my $dev (keys %$rall) { Inode: for my $inode (keys %{ $rall->{$dev} }) { my ($nlink, @@p) = @@{ $rall->{$dev}{$inode} }; debug "preprocess $dev/$inode nlink=$nlink names=@@p"; if ($nlink != @@p) { xwarn "skipping multiply-linked inode, got ", 0+@@p, " of $nlink links with: @@p\n"; next Inode; } # Skip inodes with compressed-looking names, as gzip would # do the same, and I don't want to remove links 2+ in that # case (especially if it's those names which are bad). if (grep { /$Gzip_suffix_pat\z/ } @@p) { for (@@p) { xwarn "skipping $_, ", /($Gzip_suffix_pat)\z/ ? "already has $1 suffix\n" : "other name for this inode has a" ." compressed suffix\n"; } next Inode; } # Similarly if the .gz version exists. { my @@exists = grep { lstat } map { "$_$Gzip_suffix" } @@p; if (@@exists) { xwarn "skipping @@p due to existing @@exists\n"; next Inode; } } if (@@p > 1) { my ($keep, @@rm) = @@p; @@p = ($keep); verbose "compressing $keep for @@rm"; for (@@rm) { if (!$No && !unlink $_) { xwarn "error unlinking $_:"; # put it in the %mult list anyway, I can create the # $_.gz link. } } $mult{$keep} = \@@rm; } push @@single, $p[0]; } } return \@@single, \%mult; } # Compress the listed files, which are all singly-linked. sub compress { my ($rsingle) = @@_; print data_dump "rsingle:", $rsingle if $Debug > 1; my @@cmd = (qw(xargs -0 --no-run-if-empty gzip), @@Real_gzip_arg); debug "running @@cmd"; my $fh = eval { popen_noshell 'w', @@cmd }; if (!$fh) { chomp $@@; xwarn "error running xargs/gzip: $@@\n"; return; } for (@@$rsingle) { debug "to-gzip $_"; print $fh "$_\0" or xwarn "error writing to xargs/gzip:"; } if (!close $fh) { xwarn "error closing xargs/gzip: ", ($!+0 ? "$!" : 'non-zero exit (' . waitstat($?) . ')'), "\n"; } } # Put links back that I removed, but with a .gz attached if appropriate. sub postprocess { my ($rmult) = @@_; print data_dump "rmult:", $rmult if $Debug > 1; my $suff = $Gzip_suffix; for my $base (keys %$rmult) { my @@sub = @@{ $rmult->{$base} }; my $have_plain = -f $base; my $have_gz = -f "$base$suff"; if ($have_gz) { if ($have_plain) { xwarn "both $base and $base$suff now exist,", " using $base$suff\n"; $have_plain = 0; } } elsif ($have_plain) { xwarn "$base didn't get compressed\n"; } else { xwarn "neither $base nor $base$suff now exists,", " skipping link restore of @@sub\n"; next; } my $this_suff = $have_gz ? $suff : ''; my $base_path = "$base$this_suff"; verbose "relinking $base_path to ", join ' ', map { "$_$this_suff" } @@sub; for my $sub (@@sub) { my $sub_path = "$sub$this_suff"; if (!link $base_path, $sub_path) { xwarn "error linking $base_path to $sub_path:"; } } } } sub work { my @@path = @@_; if ($Debug) { debug "input $_" for @@path; } if (!@@path) { return if $Allow_no_files; usage "no files specified\n"; } my $rinode = group_by_inode @@path; %$rinode or return; my ($rsingle, $rmult) = preprocess $rinode; @@$rsingle or return; # After this point warn but continue after errors as possible so I # put the links back for the multiply-linked inodes which were # removed. return if $No; compress $rsingle; postprocess $rmult; } sub main { init; work @@ARGV; return 0; } $Exit = main || $Exit; $Exit = 1 if $Exit && !($Exit % 256); exit $Exit; __END__ =head1 NAME gzip-links - run gzip but deal with files which have multiple (hard) links =head1 SYNOPSIS B [B<--allow-no-files>] [B<--debug>] [B<--help>] [B<--no>] [B<--stdin0>] [B<--version>] [B<-1>] [B<-2>] [B<-3>] [B<-4>] [B<-5>] [B<-6>] [B<-7>] [B<-8>] [B<-9>] [B<-N>] [B<--name>] [B<-n>] [B<--no-name>] [B<-q>] [B<--quiet>] [B<-v>] [B<--verbose>] [B<--fast>] [B<--best>] [B<--rsyncable>] [I]... file... =head1 DESCRIPTION B is like gzip but it allows you to compress multiply-linked files. It does this by removing all but one of the links, compressing the data under the remaining name, then linking to the removed names (plus .gz). =head1 OPTIONS =over 4 =item B<--allow-no-files> Don't complain if no files are given. This is particularly useful with B<--stdin0>, such as when feeding in B output when you don't know if there will be any matches. =item B<--debug> Turn debugging on. Use multiple times for more detail. =item B<--help> Show the usage message and die. =item B<--no> Go through the motions as possible, but don't change anything. =item B<--stdin0> Read null-terminated file names (such as from C) from stdin and treat them like the files specified as arguments. This is necessary because all of the names for a file have to be seen by the same B invocation, and with C this can't be guaranteed. =item B<--version> Show the version number and exit. =back =head1 OPTIONS FOR GZIP These switches are passed along to B. =over 4 =item B<-1> .. B<-9> =item B<-N>, B<--name> B<--name> doesn't make much sense in the multiply-linked case, since only 1 name (the first given on the command line) is saved. =item B<-n>, B<--no-name> =item B<-q>, B<--quiet> B<--quiet> affect B as well. =item B<-v>, B<--verbose> B<--verbose> affect B as well. =item B<--fast> =item B<--best> =item B<--rsyncable> =back =head1 EXAMPLES $ ls -liG total 3 3499474 -rw-rw-r-- 2 roderick 623 Feb 17 14:48 mult.1 3499474 -rw-rw-r-- 2 roderick 623 Feb 17 14:48 mult.2 3499476 -rw-rw-r-- 1 roderick 316 Feb 17 14:49 single $ gzip-links -v * gzip-links: compressing mult.1 for mult.2 mult.1: 40.2% -- replaced with mult.1.gz single: 36.3% -- replaced with single.gz gzip-links: relinking mult.1.gz to mult.2.gz $ ls -liG total 2 3499477 -rw-rw-r-- 2 roderick 397 Feb 17 14:48 mult.1.gz 3499477 -rw-rw-r-- 2 roderick 397 Feb 17 14:48 mult.2.gz 3499474 -rw-rw-r-- 1 roderick 226 Feb 17 14:49 single.gz $ _ =head1 AVAILABILITY The code is licensed under the GNU GPL. Check http://www.argon.org/~roderick/ for updated versions. =head1 AUTHOR Roderick Schertler =cut @ 1.3 log @Add --stdin0. @ text @d4 1 a4 1 # $Id: gzip-links,v 1.2 2005-02-17 14:56:38-05 roderick Exp roderick $ d26 1 d36 1 a36 1 my $Version = q$Revision: 1.2 $ =~ /(\d\S+)/ ? $1 : '?'; d39 1 d56 1 d106 1 d292 5 a315 1 @@ARGV or usage "no files specified\n"; d333 1 d360 6 @ 1.2 log @Oops, drop outdated XXX comment. @ text @d4 1 a4 1 # $Id: gzip-links,v 1.1 2005-02-17 14:53:43-05 roderick Exp roderick $ d33 1 d35 1 a35 1 my $Version = q$Revision: 1.1 $ =~ /(\d\S+)/ ? $1 : '?'; d41 1 d57 1 d95 12 d328 1 d363 8 @ 1.1 log @Initial revision @ text @d4 1 a4 3 # $Id$ # XXX change name? d34 1 a34 1 my $Version = q$Revision$ =~ /(\d\S+)/ ? $1 : '?'; @