[devel] rpmlndup

Сб Фев 21 00:19:13 MSK 2009

Раз уже пошла тема о скриптах,
поделюсь скриптом rpmlndup.

=head1	NAME

rpmlndup - a tool that reduces rpm repositories size by hardlinking identical rpms.

identical = name, real size and sha1/md5 sig are the same.

когда я его у себя запустил, то винт похудел на 70Гб,
и несмотря на то, что я пользуюсь --link-dest.
--link-dest ко всем dest не напишешь.

Если будет интерес, напишу help и выложу в Сизиф.

-- 

Dr. Igor Vlasenko
--------------------
Topology Department
Institute of Math
Kiev, Ukraine

----------- следующая часть -----------
#!/usr/bin/perl -w

use strict;
use warnings;
use File::Find;
use RPM::Header;
use Getopt::Long;

# Verbosity level: --quiet sets it to 0, every --verbose adds one.
my $verbose=1;
# When true, rpms that carry neither a sha1 nor an md5 sum are skipped.
my $skipnosum=0;

my $result = GetOptions (
    'quiet'=> sub {$verbose=0},
    "skip-no-sum"  => \$skipnosum,
    "verbose+"  => \$verbose,
);
# GetOptions returns false (after printing its own diagnostics) when it meets
# an unknown or malformed option.  This tool rewrites files in place, so stop
# here rather than run with options the user did not intend.
die "usage: $0 [--quiet] [--verbose] [--skip-no-sum] DIR...\n" unless $result;

# Whatever GetOptions left in @ARGV is the list of directories to scan.
my @directories = @ARGV;

foreach my $dir (@directories) {
    -d $dir or die "argument is not a directory: $dir\n";
}

# first step is just a usual find; to find dup names
# %rpmbyname maps an rpm file name (without directory) to a list of records,
# one per place where a file with that name was found.
my %rpmbyname;
find(\&wanted,  @directories);

# File::Find callback.  Inside it:
#   $File::Find::dir  = /some/path/
#   $_                = foo.ext
#   $File::Find::name = /some/path/foo.ext
# Records every non-symlink *.rpm entry in %rpmbyname with its name, full
# path, inode number and size.
sub wanted {
    my $name = $_;
    return unless $name =~ /\.rpm$/ and not -l $name;

    # stat returns an empty list on failure (e.g. the file vanished or is not
    # accessible); such an entry has no usable inode/size, so leave it out.
    my @stat = stat $name;
    unless (@stat) {
        warn "$File::Find::name skipped: stat failed: $!\n" if $verbose;
        return;
    }

    # stat fields used below: [1] inode number, [7] total size in bytes.
    push @{ $rpmbyname{$name} }, {
        NAME  => $name,
        PATH  => $File::Find::name,
        INODE => $stat[1],
        SIZE  => $stat[7],
    };
    return;
}

# second step is to find genuine dups; the same size and sha1/md5sum.
# %rpmbysum is filled by bysum(): key is name + sum + size, value is the list
# of file records sharing that key.
my %rpmbysum;
foreach my $samename (values %rpmbyname) {
    # a name that was seen only once has nothing to be linked with
    next unless @$samename > 1;
    # all copies already share one inode: nothing left to gain
    my %seen_inode;
    $seen_inode{ $_->{INODE} } = 1 foreach @$samename;
    next unless keys(%seen_inode) > 1;
    bysum($_) foreach @$samename;
}

undef %rpmbyname;
my $dupcount=0;
my $economy=0;

# Every element is a list of identical files that occupy two or more inodes.
my @rpmtolink;
foreach my $identical (values %rpmbysum) {
    next unless @$identical > 1;
    my %seen_inode;
    $seen_inode{ $_->{INODE} } = 1 foreach @$identical;
    my $distinct = keys %seen_inode;
    next unless $distinct > 1;
    # one inode stays, each of the other distinct inodes is counted as saved
    my $extra = $distinct - 1;
    $economy  += $identical->[0]->{SIZE} * $extra;
    $dupcount += $extra;
    push @rpmtolink, $identical;
}
undef %rpmbysum;

# Report the expected gain and ask for confirmation before touching any file.
print STDERR "hardlinking duplicate rpms will give print total economy:
$economy bytes in $dupcount rpms.\n";
print STDERR "Do you want to continue (y/n)?.\n";
# Read the answer straight from STDIN.  End of input (undef) counts as "no";
# matching undef against the pattern would only produce a warning.
my $answer = <STDIN>;
exit 0 unless defined $answer and $answer =~ /^\s*y/i;
print "continue with ".scalar(@rpmtolink)." dups\n";

# Third step: within every group of identical files keep the first one as the
# master and replace each other file that lives on a different inode with a
# hard link to the master.  The file being replaced is first renamed to
# "<path>.bak", so that it can be put back if link() fails.
foreach my $lref (@rpmtolink) {
    die "internal error! not enough files!" if @$lref < 2;
    my $master=$lref->[0];
    my $masterinode=$master->{INODE};
    my $masterpath=$master->{PATH};
    for my $i (1 .. $#$lref) {
        my $slave=$lref->[$i];
        my $slavepath=$slave->{PATH};
        # same inode as the master: already linked, nothing to do
        next if $slave->{INODE} == $masterinode;
        die "impossible :(" if $slavepath eq $masterpath;

        my $backup = $slavepath.'.bak';
        # "or", not "||": "||" would bind to the last argument (an always-true
        # string), so the die could never run and a failed rename would go
        # unnoticed.
        rename($slavepath, $backup)
            or die "rename $slavepath, $slavepath.bak failed: $!";
        unless (link $masterpath, $slavepath) {
            warn "link $masterpath, $slavepath failed: $!";
            rename($backup, $slavepath)
                or warn "restoring $slavepath from $backup failed: $!";
            die "execution aborted.";
        }
        # Copy the timestamps of the replaced file onto the new link.
        # NOTE(review): the link shares the master's inode, so this presumably
        # changes the timestamps seen through every link - confirm it is wanted.
        system('touch','-acm','-r',$backup,'--',$slavepath) == 0
            or warn "touch -r $backup $slavepath failed\n";
        unlink($backup) or die "cleanup of $slavepath failed: $!";
        print "linked successfully: $masterpath -> $slavepath\n" if $verbose;
    }
}

# Files one rpm record under a key built from its name, its header sum and
# its real size, so that records with equal keys end up in one %rpmbysum list.
#
# Argument: a hash reference with at least NAME, PATH and SIZE.
# Side effects: sets $rpm->{SUM} and appends $rpm to $rpmbysum{<key>}.
# Returns nothing.  The record is left out when its header cannot be read, or
# when it has no sum and --skip-no-sum was given.
sub bysum {
    my ($rpm) = @_;
    my $size = $rpm->{SIZE};

    my $header = eval { RPM::Header->new($rpm->{PATH}) };
    # Also treat a false return value as failure, in case the constructor
    # reports errors that way instead of dying; without a header the file
    # would otherwise be compared by name and size only.
    if ($@ or not $header) {
        my $reason = $@ || "cannot read rpm header";
        warn "$rpm->{PATH} skipped: $reason\n" if $verbose;
        return;
    }

    my $sum = $header->{SHA1HEADER}->[0];
    unless ($sum) {
        warn "no sha1sum for $rpm->{NAME} - trying MD5\n" if $verbose;
        $sum = $header->{SIGMD5}->[0];
        unless ($sum) {
            warn "no md5sum for $rpm->{NAME}\n" if $verbose;
            return if $skipnosum;
            # let at least the declared size be the same
            $sum = $header->{SIGSIZE}->[0];
            $sum ||= $size;
        }
    }
    $rpm->{SUM} = $sum;

    my $key = $rpm->{NAME}.'!'.$sum.'|'.$size;
    push @{ $rpmbysum{$key} }, $rpm;
    return;
}

=head1	NAME

rpmlndup - a tool that reduces rpm repositories size by hardlinking identical rpms.

=head1	SYNOPSIS

B<rpmlndup>
[B<-h|--help>]
[B<-v|--verbose>]
[B<-q|--quiet>]
[B<-y|--yes|--batch>]
[B<-a|--ask|--interactive>]
[B<-n|--no|--count>]
[B<-s|--skip-no-sum>]
[I<DIR>...] 

=head1	DESCRIPTION

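B<rpmlndup> scans the given directories for rpm files that have the same
file name. Files that also have the same real size and the same sha1/md5
header sum are considered identical and are replaced with hard links to
a single copy.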

=head1	OPTIONS

=over

=item	B<-h, --help>

Display this help and exit.

=item	B<-v, --verbose>, B<-q, --quiet>

Verbosity level. Multiple -v increase the verbosity level, -q sets it to 0.

=item	B<-y|--yes>, B<--batch>

Batch mode. Links identical rpms after counting.

=item	B<-n|--no>, B<--count>

Do not link identical rpms, just count the space that would be freed.

=item	B<-a|--ask>, B<--interactive>

Interactive mode (default). Counts free space and asks to proceed with linking.

=item	B<-s|--skip-no-sum>

Skip unsigned rpms (that have no sha1 or md5 sum).

=back

=head1	AUTHOR

Written by Igor Vlasenko <viy на altlinux.org>.

=head1	COPYING

This is free software; you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation;
either version 2 of the License, or (at your option) any later version.

=cut