[Comm] entities 2 chars (was: В э)

Michael Shigorin mike на osdn.org.ua
Ср Окт 12 15:33:54 MSD 2005


On Tue, Oct 11, 2005 at 10:02:52AM +0700, Andrei Lomov wrote:
> С кодировкой темы все в порядке.
> Пришло письмо с hotmail, примерно в таком виде:
> В этот
> и т.д.  Как его прочитать?

Гляньте в аттач, мож пригодится (что-то из подобного уникоду
в упаковке html entities им и раскурочивал).

-- 
 ---- WBR, Michael Shigorin <mike на altlinux.ru>
  ------ Linux.Kiev http://www.linux.kiev.ua/
----------- следующая часть -----------
#!/usr/bin/perl -w

# char2ent.pl
# 
# Simple utility to convert files with &#ddd; to/from 8bit chars
# See usage at end of this file ( or ./char2ent -h )
# PS works only with 8bit chars, not talking about UTF-16 Unicode here
#
# mode=html (default)
#   Convert 8bit chars (with high bit set) to html entity &#ddd;
#
# mode=work:
#   Convert html entities &#ddd; to the corresponding 8bit char
#
# Christophe Chisogne <christophe на publicityweb.com>

use Getopt::Long;
use strict;

my $PROG = 'char2ent';		# program name shown in messages
my $VERSION = '0.02';
my $DATE = '2003/11/07';
my $BACK = 'bak';		# extension appended to backup copies

# Values filled in from the command line.  $mode defaults to 'html';
# the remaining switches stay undef unless given.
my ($mode, $backup, $confirm, $keep, $version, $help);
$mode = 'html';
GetOptions('version|v' => \$version,
	'help|h' => \$help,
	# -m is spelled out so the documented short form does not depend on
	# Getopt::Long's automatic abbreviation being enabled.
	'mode|m=s' => \$mode,
	'backup|b' => \$backup,
	'confirm|c' => \$confirm,
	'keep|k' => \$keep,
	)
or usage();

version() if defined $version;
# At least one file is required; several may be given, and the conversion
# loop further down processes each of them in turn.
usage() if defined($help) || @ARGV < 1;

# $conv holds a reference to the per-line conversion routine for the
# selected mode.  The mode name must match exactly (case-insensitively).
my $conv;
if ($mode =~ /\Ahtml\z/i) {
	print "Conversion from 8bit chars to &#ddd; entities\n";
	$conv = \&char2ent;
} elsif ($mode =~ /\Awork\z/i) {
	print "Conversion from &#ddd; entities to 8bit chars\n";
	$conv = \&ent2char;
} else {
	usage();
}

# Latin1 convert table taken (thanks awk ;-) from
# http://www.w3.org/TR/html401/sgml/entities.html
#
# Portions © International Organization for Standardization 1986
# Permission to copy in any form is granted for use with
# conforming SGML systems and applications as defined in
# ISO 8879, provided this notice is included in all copies.

# Named Latin-1 entities mapped to their numeric &#ddd; form.
# The names below are listed in code-point order, starting at 160 (&nbsp;)
# and ending at 255 (&yuml;), so each numeric value is simply derived from
# the position in the list.  Lookups are case sensitive: &Eacute; and
# &eacute; are different entities.
my %latin1 = do {
	my $code = 160;
	map { ( "&$_;" => '&#' . $code++ . ';' ) } qw(
		nbsp   iexcl  cent   pound  curren yen    brvbar sect
		uml    copy   ordf   laquo  not    shy    reg    macr
		deg    plusmn sup2   sup3   acute  micro  para   middot
		cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest
		Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
		Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
		ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
		Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
		agrave aacute acirc  atilde auml   aring  aelig  ccedil
		egrave eacute ecirc  euml   igrave iacute icirc  iuml
		eth    ntilde ograve oacute ocirc  otilde ouml   divide
		oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
	);
};

# Convert every file named on the command line.  With --confirm the user
# is asked about each file first; only an answer that starts with "n" or
# "N" skips the file, anything else (including a bare Enter) converts it.
foreach my $filename (@ARGV) {
	my $ok = 'y';
	if (defined $confirm) {
		print "Convert file [$filename]? [Yn] ";
		my $answer = <STDIN>;
		# <STDIN> yields undef at end of input; keep the default answer
		# then instead of matching against an undefined value.
		$ok = $answer if defined $answer;
	}
	unless ($ok =~ /^\s*n/i) {
		print "Converting file [$filename]...\n";
		convertfile($filename);
	}
}
exit 0;

# convertfile($filename)
# Rewrite $filename in place, passing every line through the conversion
# routine referenced by $conv.  The converted text is first written to a
# temporary file named "$filename.<pid>" in the same directory and then
# renamed over the original.  When $backup is set, the original is kept
# as "$filename.$BACK".  Dies with the system error on any I/O failure.
sub convertfile {
	my $filename = shift;
	my $tmpname = "$filename.$$";
	# Three-argument open with lexical handles: the file name can no longer
	# be misread as an open mode, and the handles close when they go out of
	# scope.
	open my $in, '<', $filename or die "Cant open $filename: $!\n";
	open my $out, '>', $tmpname or die "Cant write $tmpname: $!\n";
	while (my $line = <$in>) {
		print {$out} $conv->($line)
			or die "Cant write $tmpname: $!\n";
	}
	close $in;
	# Buffered write errors (disk full, ...) only show up at close time;
	# without this check a truncated file could replace the original.
	close $out or die "Cant write $tmpname: $!\n";
	if ($backup) {
		rename($filename, "$filename.$BACK") 
			or die "Cant backup $filename.$BACK: $!\n";
	}
	rename($tmpname, $filename) 
		or die "Cant write $filename from $tmpname: $!\n";
}

# $line2 = char2ent($line)
# Return a copy of $line in which every character whose code is above 127
# has been replaced by its decimal entity &#ddd;.  Characters in the
# 7-bit range, including the line terminator, are passed through as is.
sub char2ent {
	my ($text) = @_;
	$text =~ s/([^\x00-\x7f])/'&#' . ord($1) . ';'/ge;
	return $text;
}

# $line2 = ent2char($line)
# Return a copy of $line with three-digit numeric entities &#ddd; replaced
# by the corresponding 8bit character.  Unless $keep is set, named Latin-1
# entities found in %latin1 (&eacute; etc) are first rewritten to their
# numeric form so that they get converted as well.
sub ent2char {
	my $line = shift;
	# first change all &eacute; etc to &#ddd; unless told otherwise
	unless (defined $keep) {
		# One pass with a table lookup instead of one substitution per
		# table entry; names that are not in the table are left alone.
		$line =~ s/(&[A-Za-z0-9]+;)/exists $latin1{$1} ? $latin1{$1} : $1/ge;
	}
	# then &#ddd; to 8bit char.  Values above 255 do not fit in a single
	# byte, so those entities are left untouched rather than turned into
	# wide characters.
	$line =~ s/&#(\d\d\d);/$1 <= 255 ? chr($1) : "&#$1;"/ge;
	$line;
}

# version()
# Print the program name, version and date, a short description and the
# usage text, then exit successfully.
sub version {
	print "$PROG v$VERSION, $DATE\n\n";
	print "Convert files with 8bit chars to/from &#ddd; entities\n";
	print "Can convert &name; entities from latin1 (160-255)\n";
	print "\n";	
	# Asking for the version is not an error, so request exit status 0
	# instead of usage()'s default of 1.
	usage(0);
}

# usage([$status])
# Print the command-line summary and exit.  $status is the process exit
# code; it defaults to 1, which is what callers reporting a bad command
# line get when they pass no argument.
sub usage {
	my $status = shift;
	$status = 1 unless defined $status;
	print <<EOF;
Usage:
$PROG [--mode=html|work] [-b] [-c] [-k] 8bitfile.txt ...
$PROG [--help] [--version]

--mode=x,  -m=x   choose html mode (default) or work mode
--backup,  -b     backup of modified file
--confirm, -c     confirm conversion of each file
--keep,    -k     dont translate &name; entities to &#ddd;
EOF
	exit $status;
}

----------- следующая часть -----------
Было удалено вложение не в текстовом формате...
Имя     : отсутствует
Тип     : application/pgp-signature
Размер  : 189 байтов
Описание: отсутствует
Url     : <http://lists.altlinux.org/pipermail/community/attachments/20051012/427b836b/attachment-0003.bin>


Подробная информация о списке рассылки community