[Comm] entities 2 chars (was: В э)
Michael Shigorin
=?iso-8859-1?q?mike_=CE=C1_osdn=2Eorg=2Eua?=
Ср Окт 12 15:33:54 MSD 2005
On Tue, Oct 11, 2005 at 10:02:52AM +0700, Andrei Lomov wrote:
> С кодировкой темы все в порядке.
> Пришло письмо с hotmail, примерно в таком виде:
> В этот
> и т.д. Как его прочитать?
Гляньте в аттач, мож пригодится (что-то из подобного уникоду
в упаковке html entities им и раскурочивал).
--
---- WBR, Michael Shigorin <mike на altlinux.ru>
------ Linux.Kiev http://www.linux.kiev.ua/
----------- следующая часть -----------
#!/usr/bin/perl -w
# char2ent.pl
#
# Simple utility to convert files with &#ddd; to/from 8bit chars
# See usage at end of this file ( or ./char2ent -h )
# PS works only with 8bit chars, not talking about UTF-16 Unicode here
#
# mode=html (default)
# Convert 8bit chars (with high bit set) to html entity &#ddd;
#
# mode=work:
# Convert html entities &#ddd; to the corresponding 8bit char
#
# Christophe Chisogne <christophe на publicityweb.com>
use Getopt::Long;
use strict;
# Program identity, printed by version() and usage().
my $PROG = 'char2ent'; # prog name to display
my $VERSION = '0.02';
my $DATE = '2003/11/07';
my $BACK = 'bak'; # extension for backup files

# vars from CLI options; $mode defaults to 'html', the others stay
# undefined unless the matching switch is given
my ($mode, $backup, $confirm, $keep, $version, $help);
$mode = 'html';
GetOptions(
    'version|v' => \$version,
    'help|h'    => \$help,
    'mode=s'    => \$mode,
    'backup|b'  => \$backup,
    'confirm|c' => \$confirm,
    'keep|k'    => \$keep,
) or usage();    # GetOptions returns false on an unknown or malformed option

version() if defined $version;

# At least one file name is required.  Several are accepted: the usage
# text advertises "8bitfile.txt ..." and the main loop walks all of @ARGV.
usage() if defined($help) || @ARGV < 1;

# Pick the per-line conversion routine according to --mode.
my $conv;
if ($mode =~ /html/i) {
    print "Conversion from 8bit chars to &#ddd; entities\n";
    $conv = \&char2ent;
} elsif ($mode =~ /work/i) {
    print "Conversion from &#ddd; entities to 8bit chars\n";
    $conv = \&ent2char;
} else {
    usage();
}
# Latin1 convert table taken (thanks awk ;-) from
# http://www.w3.org/TR/html401/sgml/entities.html
#
# Portions (C) International Organization for Standardization 1986
# Permission to copy in any form is granted for use with
# conforming SGML systems and applications as defined in
# ISO 8879, provided this notice is included in all copies.
# warning, case sensitive for matches
# Maps each named Latin-1 entity (&name;) to its numeric form (&#ddd;),
# for character codes 160..255.  ent2char() uses it to normalise named
# entities before turning numeric entities into 8bit chars.
# Both sides must stay spelled as ASCII entity text: if they are stored
# as the literal characters the table degenerates into an identity map.
my %latin1 = (
    '&nbsp;'   => '&#160;',
    '&iexcl;'  => '&#161;',
    '&cent;'   => '&#162;',
    '&pound;'  => '&#163;',
    '&curren;' => '&#164;',
    '&yen;'    => '&#165;',
    '&brvbar;' => '&#166;',
    '&sect;'   => '&#167;',
    '&uml;'    => '&#168;',
    '&copy;'   => '&#169;',
    '&ordf;'   => '&#170;',
    '&laquo;'  => '&#171;',
    '&not;'    => '&#172;',
    '&shy;'    => '&#173;',
    '&reg;'    => '&#174;',
    '&macr;'   => '&#175;',
    '&deg;'    => '&#176;',
    '&plusmn;' => '&#177;',
    '&sup2;'   => '&#178;',
    '&sup3;'   => '&#179;',
    '&acute;'  => '&#180;',
    '&micro;'  => '&#181;',
    '&para;'   => '&#182;',
    '&middot;' => '&#183;',
    '&cedil;'  => '&#184;',
    '&sup1;'   => '&#185;',
    '&ordm;'   => '&#186;',
    '&raquo;'  => '&#187;',
    '&frac14;' => '&#188;',
    '&frac12;' => '&#189;',
    '&frac34;' => '&#190;',
    '&iquest;' => '&#191;',
    '&Agrave;' => '&#192;',
    '&Aacute;' => '&#193;',
    '&Acirc;'  => '&#194;',
    '&Atilde;' => '&#195;',
    '&Auml;'   => '&#196;',
    '&Aring;'  => '&#197;',
    '&AElig;'  => '&#198;',
    '&Ccedil;' => '&#199;',
    '&Egrave;' => '&#200;',
    '&Eacute;' => '&#201;',
    '&Ecirc;'  => '&#202;',
    '&Euml;'   => '&#203;',
    '&Igrave;' => '&#204;',
    '&Iacute;' => '&#205;',
    '&Icirc;'  => '&#206;',
    '&Iuml;'   => '&#207;',
    '&ETH;'    => '&#208;',
    '&Ntilde;' => '&#209;',
    '&Ograve;' => '&#210;',
    '&Oacute;' => '&#211;',
    '&Ocirc;'  => '&#212;',
    '&Otilde;' => '&#213;',
    '&Ouml;'   => '&#214;',
    '&times;'  => '&#215;',
    '&Oslash;' => '&#216;',
    '&Ugrave;' => '&#217;',
    '&Uacute;' => '&#218;',
    '&Ucirc;'  => '&#219;',
    '&Uuml;'   => '&#220;',
    '&Yacute;' => '&#221;',
    '&THORN;'  => '&#222;',
    '&szlig;'  => '&#223;',
    '&agrave;' => '&#224;',
    '&aacute;' => '&#225;',
    '&acirc;'  => '&#226;',
    '&atilde;' => '&#227;',
    '&auml;'   => '&#228;',
    '&aring;'  => '&#229;',
    '&aelig;'  => '&#230;',
    '&ccedil;' => '&#231;',
    '&egrave;' => '&#232;',
    '&eacute;' => '&#233;',
    '&ecirc;'  => '&#234;',
    '&euml;'   => '&#235;',
    '&igrave;' => '&#236;',
    '&iacute;' => '&#237;',
    '&icirc;'  => '&#238;',
    '&iuml;'   => '&#239;',
    '&eth;'    => '&#240;',
    '&ntilde;' => '&#241;',
    '&ograve;' => '&#242;',
    '&oacute;' => '&#243;',
    '&ocirc;'  => '&#244;',
    '&otilde;' => '&#245;',
    '&ouml;'   => '&#246;',
    '&divide;' => '&#247;',
    '&oslash;' => '&#248;',
    '&ugrave;' => '&#249;',
    '&uacute;' => '&#250;',
    '&ucirc;'  => '&#251;',
    '&uuml;'   => '&#252;',
    '&yacute;' => '&#253;',
    '&thorn;'  => '&#254;',
    '&yuml;'   => '&#255;',
);
# Convert every file named on the command line, asking for confirmation
# first when --confirm was given.
foreach my $filename (@ARGV) {
    if (defined $confirm) {
        print "Convert file [$filename]? [Yn] ";
        my $answer = <STDIN>;
        # End of input: nobody is left to confirm, so stop here instead
        # of matching against undef and converting unconfirmed.
        last unless defined $answer;
        # Only an answer that starts with "n"/"N" declines; an empty
        # line takes the default (yes).  Matching "n" anywhere would
        # also refuse answers such as "fine".
        next if $answer =~ /^\s*n/i;
    }
    print "Converting file [$filename]...\n";
    convertfile($filename);
}
exit 0;
# convertfile($filename)
#
# Rewrites $filename in place: every line is passed through the
# conversion routine in $conv and written to a temporary file
# "$filename.<pid>", which is then renamed over the original.  When
# --backup is set the original is first renamed to "$filename.$BACK".
# Dies with the system error on any I/O failure; the temporary file is
# removed when the failure happens before the final rename.
sub convertfile {
    my $filename = shift;
    my $tmpname = "$filename.$$";

    # Three-argument open: the file name is taken literally, so a name
    # starting with '>' or ending in '|' cannot change the open mode.
    open my $in, '<', $filename
        or die "Cant open $filename: $!\n";
    open my $out, '>', $tmpname
        or die "Cant write $tmpname: $!\n";

    while (my $line = <$in>) {
        unless (print {$out} $conv->($line)) {
            my $err = $!;
            close $out;
            unlink $tmpname;
            die "Cant write $tmpname: $err\n";
        }
    }
    close $in;

    # Buffered write errors (e.g. disk full) only show up at close time;
    # without this check a truncated file would replace the original.
    unless (close $out) {
        my $err = $!;
        unlink $tmpname;
        die "Cant write $tmpname: $err\n";
    }

    if ($backup) {
        unless (rename($filename, "$filename.$BACK")) {
            my $err = $!;
            unlink $tmpname;
            die "Cant backup $filename.$BACK: $err\n";
        }
    }
    rename($tmpname, $filename)
        or die "Cant write $filename from $tmpname: $!\n";
}
# $line2 = char2ent($line)
#
# Returns a copy of $line in which every character with a code above 127
# is replaced by its decimal entity &#ddd;.  All other characters,
# including the line terminator, pass through unchanged.
sub char2ent {
    my ($text) = @_;
    # Only characters outside 0..127 need rewriting, so match just those.
    $text =~ s{([^\x00-\x7F])}{ sprintf '&#%d;', ord $1 }ge;
    return $text;
}
# $line2 = ent2char($line)
#
# Returns a copy of $line with entities turned back into 8bit chars:
#   1. unless --keep was given, named entities found in %latin1
#      (e.g. &eacute;) are rewritten to their &#ddd; form;
#   2. three-digit numeric entities &#ddd; with a value up to 255 are
#      replaced by the corresponding single character.
# Numeric entities above 255 are left as they are: they do not fit in
# one 8bit char and chr() would produce a wide character instead.
sub ent2char {
    my $line = shift;
    # first change all &eacute; etc to &#ddd; unless told otherwise
    unless (defined $keep) {
        # One pass with a hash lookup instead of one substitution per
        # table entry; names that are not in the table stay untouched.
        # The match is case sensitive, like the table keys.
        $line =~ s/(&[A-Za-z][A-Za-z0-9]*;)/exists $latin1{$1} ? $latin1{$1} : $1/ge;
    }
    # then &#ddd; to 8bit char
    $line =~ s/&#(\d\d\d);/$1 <= 255 ? chr($1) : "&#$1;"/ge;
    return $line;
}
# version()
#
# Prints the program name, version and release date followed by a short
# description, then hands over to usage() for the option summary.
sub version {
    my @banner = (
        "$PROG v$VERSION, $DATE\n\n",
        "Convert files with 8bit chars to/from &#ddd; entities\n",
        "Can convert &name; entities from latin1 (160-255)\n",
        "\n",
    );
    print join '', @banner;
    usage();
    # usage() ends the process itself, so this is only reached if it
    # ever returns.
    exit 0;
}
# usage()
#
# Prints the command-line synopsis and the option summary to standard
# output, then terminates the program with exit status 1.
sub usage {
    my $help_text = <<EOF;
Usage:
$PROG [--mode=html|work] [-b] [-c] [-k] 8bitfile.txt ...
$PROG [--help] [--version]
--mode=x, -m=x choose html mode (default) or work mode
--backup, -b backup of modified file
--confirm, -c confirm conversion of each file
--keep, -k dont translate &name; entities to &#ddd;
EOF
    print $help_text;
    exit 1;
}
----------- следующая часть -----------
Было удалено вложение не в текстовом формате...
Имя : =?iso-8859-1?q?=CF=D4=D3=D5=D4=D3=D4=D7=D5=C5=D4?=
Тип : application/pgp-signature
Размер : 189 байтов
Описание: =?iso-8859-1?q?=CF=D4=D3=D5=D4=D3=D4=D7=D5=C5=D4?=
Url : <http://lists.altlinux.org/pipermail/community/attachments/20051012/427b836b/attachment-0003.bin>
Подробная информация о списке рассылки community