[devel] rpmlndup
Igor Vlasenko
vlasenko на imath.kiev.ua
Сб Фев 21 00:19:13 MSK 2009
Раз уже пошла тема о скриптах,
поделюсь скриптом rpmlndup.
=head1 NAME
rpmlndup - a tool that reduces rpm repositories size by hardlinking identical rpms.
identical = name, real size and sha1/md5 sig are the same.
Когда я запустил его у себя, винт похудел на 70 Гб,
и это несмотря на то, что я пользуюсь --link-dest.
--link-dest ко всем dest не напишешь.
Если будет интерес, напишу help и выложу в Сизиф.
--
Dr. Igor Vlasenko
--------------------
Topology Department
Institute of Math
Kiev, Ukraine
----------- следующая часть -----------
#!/usr/bin/perl -w
use strict;
use warnings;
use File::Find;
use RPM::Header;
use Getopt::Long;
# Command-line state.
#   $verbose   - message level; starts at 1, each --verbose adds one,
#                --quiet resets it to 0.
#   $skipnosum - when set, rpms without a sha1/md5 signature are ignored
#                instead of being compared by size only.
my $verbose=1;
my $skipnosum=0;
my $result = GetOptions (
    'quiet'       => sub { $verbose = 0 },
    'skip-no-sum' => \$skipnosum,
    'verbose+'    => \$verbose,
);
# GetOptions has already printed a diagnostic for each bad option; do not
# start touching the repository after a mistyped command line.
die "invalid command line options\n" unless $result;

# Everything left on the command line must be a directory to scan.
my @directories = @ARGV;
for my $dir (@directories) {
    -d $dir or die "argument is not a directory: $dir\n";
}

# First step is a plain find that groups rpm files by their file name;
# only names seen more than once can be duplicates.
my %rpmbyname;
find(\&wanted, @directories);
# File::Find callback: records every regular *.rpm file in %rpmbyname,
# keyed by its base name.
#
# On entry File::Find provides:
#   $File::Find::dir  = /some/path/
#   $_                = foo.ext
#   $File::Find::name = /some/path/foo.ext
#
# Each record is a hash with NAME (base name), PATH (full name as reported
# by File::Find), INODE and SIZE (fields 1 and 7 of the stat list).
sub wanted {
    my $name = $_;
    return unless $name =~ /\.rpm$/;

    # lstat does not follow symlinks, so the -f test below rejects symlinks
    # as well as directories or other special files named *.rpm.
    my @stat = lstat $name;
    unless (@stat) {
        # The file vanished or is unreadable; skip it instead of storing a
        # record with undefined inode and size.
        warn "cannot stat $File::Find::name: $!\n";
        return;
    }
    return unless -f _;

    # push autovivifies the per-name list on first use.
    push @{ $rpmbyname{$name} }, {
        NAME  => $name,
        PATH  => $File::Find::name,
        INODE => $stat[1],
        SIZE  => $stat[7],
    };
    return;
}
# Second step: among files sharing a name, find the genuine duplicates,
# i.e. those with the same size and sha1/md5 sum. bysum() files each
# candidate into %rpmbysum.
my %rpmbysum;
while (my ($filename, $candidates) = each %rpmbyname) {
    # A name that occurs only once cannot have a duplicate.
    next unless @$candidates >= 2;

    # If every occurrence is already the same inode there is nothing to do,
    # so do not spend time reading the headers.
    my %seen_inode;
    $seen_inode{ $_->{INODE} } = 1 for @$candidates;
    next unless keys(%seen_inode) >= 2;

    bysum($_) for @$candidates;
}
undef %rpmbyname;
# Walk the checksum groups, keep those that span at least two distinct
# inodes, and add up how many files and bytes linking them would save.
my $dupcount=0;
my $economy=0;
my @rpmtolink;
while (my ($key, $group) = each %rpmbysum) {
    next unless @$group >= 2;

    my %seen_inode;
    $seen_inode{ $_->{INODE} } = 1 for @$group;
    my $distinct = keys %seen_inode;
    next unless $distinct >= 2;

    # One inode stays; every other distinct inode is counted as saved.
    my $dupnum = $distinct - 1;
    $economy  += $group->[0]->{SIZE} * $dupnum;
    $dupcount += $dupnum;
    push @rpmtolink, $group;
}
undef %rpmbysum;
# Report the expected savings and ask for confirmation before changing
# anything on disk.
print STDERR "hardlinking duplicate rpms will give print total economy:
$economy bytes in $dupcount rpms.\n";
print STDERR "Do you want to continue (y/n)?.\n";
# With @ARGV emptied, <> reads the answer from standard input rather than
# from files named on the command line.
@ARGV=();
my $answer = <>;
# End of input (undef) counts as "no"; checking it first avoids an
# "uninitialized value" warning in the pattern match.
exit 0 unless defined $answer and $answer =~ /^\s*y/i;
print "continue with ".scalar(@rpmtolink)." dups\n";
# Third step: within each group of identical rpms, keep the first file as
# the master and replace every other file with a hard link to it.
# Each replacement moves the file aside to "<path>.bak", creates the link,
# copies the old timestamps onto it, then removes the backup.
foreach my $lref (@rpmtolink) {
    die "internal error! not enough files!" if @$lref < 2;
    my ($master, @slaves) = @$lref;
    my $masterinode = $master->{INODE};
    my $masterpath  = $master->{PATH};

    for my $slave (@slaves) {
        my $slavepath = $slave->{PATH};
        my $backup    = $slavepath.'.bak';

        # Already a link to the master: nothing to do.
        next if $slave->{INODE} == $masterinode;
        die "impossible :(" if $slavepath eq $masterpath;

        # "or", not "||": with "||" the die was attached to the (always
        # true) file name argument and could never fire.
        rename $slavepath, $backup
            or die "rename $slavepath, $slavepath.bak failed: $!";

        unless (link $masterpath, $slavepath) {
            warn "link $masterpath, $slavepath failed: $!";
            # Put the original file back before giving up.
            rename $backup, $slavepath
                or warn "could not restore $slavepath from $backup: $!";
            die "execution aborted.";
        }

        # Copy the replaced file's timestamps onto the new link. The link
        # shares the master's inode, so this re-stamps every path that
        # refers to that inode. Best effort: a failure is only reported.
        system('touch','-acm','-r',$backup,'--',$slavepath) == 0
            or warn "touch -r $backup $slavepath failed\n";

        unlink $backup
            or die "cleanup of $slavepath failed: $!";
        print "linked successfully: $masterpath -> $slavepath\n" if $verbose;
    }
}
# Files one rpm record into %rpmbysum under a key built from its name,
# checksum and on-disk size, so that records sharing a key are treated as
# identical packages.
#
# Argument: a record hash as built by wanted() (NAME, PATH, INODE, SIZE).
# The chosen checksum is also stored in the record as SUM.
#
# Checksum preference: SHA1HEADER, then SIGMD5. When neither is present the
# record is dropped if --skip-no-sum was given; otherwise SIGSIZE, or
# failing that the on-disk size, stands in for the checksum.
sub bysum {
    my ($rpm) = @_;
    my $size = $rpm->{SIZE};

    my $header;
    eval {
        $header = RPM::Header->new($rpm->{PATH});
    };
    if ($@) {
        warn "$rpm->{PATH} skipped: $@\n" if $verbose;
        return;
    }
    # NOTE(review): guards against new() reporting failure by returning a
    # non-reference instead of dying - confirm against the installed
    # RPM::Header. Without this check an undef header would autovivify
    # below and the file would be matched by name and size alone.
    unless (ref $header) {
        warn "$rpm->{PATH} skipped: cannot read rpm header\n" if $verbose;
        return;
    }

    my $sum = $header->{SHA1HEADER}->[0];
    unless ($sum) {
        warn "no sha1sum for $rpm->{NAME} - trying MD5\n" if $verbose;
        $sum = $header->{SIGMD5}->[0];
        unless ($sum) {
            warn "no md5sum for $rpm->{NAME}\n" if $verbose;
            return if $skipnosum;
            # let at least the declared size be the same
            $sum = $header->{SIGSIZE}->[0];
            $sum ||= $size;
        }
    }
    $rpm->{SUM} = $sum;

    my $key = $rpm->{NAME}.'!'.$sum.'|'.$size;
    push @{ $rpmbysum{$key} }, $rpm;
    return;
}
=head1 NAME
rpmlndup - a tool that reduces the size of rpm repositories by hardlinking identical rpms.
=head1 SYNOPSIS
B<rpmlndup>
[B<-h|--help>]
[B<-v|--verbose>]
[B<-q|--quiet>]
[B<-y|--yes|--batch>]
[B<-a|--ask|--interactive>]
[B<-n|--no|--count>]
[B<-s|--skip-no-sum>]
[I<DIR>...]
=head1 DESCRIPTION
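B<rpmlndup> recursively scans the given directories for rpm files (symbolic
links are skipped) and groups files that have the same name. Files in a group
are considered identical when their size and their SHA1 (or, failing that, MD5)
header signature are also the same. After reporting how much space can be
saved, it replaces each identical file with a hard link to a single copy.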
=head1 OPTIONS
=over
=item B<-h, --help>
Display this help and exit.
=item B<-v, --verbose>, B<-q, --quiet>
Verbosity level. Multiple -v increase the verbosity level, -q sets it to 0.
=item B<-y|--yes>, B<--batch>
Batch mode. Links identical rpms after counting.
=item B<-n|--no>, B<--count>
Do not link identical rpms; only count the space that would be freed.
=item B<-a|--ask>, B<--interactive>
Interactive mode (default). Counts free space and asks to proceed with linking.
=item B<-s|--skip-no-sum>
Skip unsigned rpms (that have no sha1 or md5 sum).
=back
=head1 AUTHOR
Written by Igor Vlasenko <viy на altlinux.org>.
=head1 COPYING
Copyright (c) 2009 Igor Vlasenko, ALT Linux Team.
This is free software; you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation;
either version 2 of the License, or (at your option) any later version.
=cut
Подробная информация о списке рассылки Devel