X
Xah Lee
Would anyone like to translate the following Perl script to Python or
Scheme (scsh)?
The script takes an input path and reports all HTML files under it that
are above a certain size (counting inline images).
It also prints a sorted report of the HTML files and their sizes.
(a copy of the script is here:
http://xahlee.org/_scripts/check_file_size.pl
)
Xah
(e-mail address removed)
∑ http://xahlee.org/
# perl
# Tue Oct 4 14:36:48 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org
use Data::Dumper;
use File::Find;
use File::Basename;

# Configuration.
# $inpath should be a full path; else $File::Find::dir won't give a
# full path.
$inpath = '/Users/t/web/mydirectory/';
# $inpath = $ARGV[0];

# Size threshold, compared against the byte count of an html file plus
# its inline images.
$sizeLimit = 800 * 1000;

# Get rid of trailing slashes (a lone "/" is left alone because the
# pattern requires at least one character before the slash).
while ($inpath =~ m@^(.+)/$@) { $inpath = $1; }

die "dir $inpath doesn't exist! $!" unless -e $inpath;
##################################################
# subroutines
# getInlineImg($file_full_path) returns an array that is a list of
# inline images, exactly as written in each src attribute. For example,
# it may return ('xx.jpg', '../image.png').
#
# The file is scanned line by line. Each line is cut at every "img"
# token (any letter case), and the first double-quoted src="..." value
# in each piece that follows such a token is collected. An img tag
# whose src sits on a different line than "img" is therefore not found.
#
# Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_name) = @_;
    my @linx;

    open(my $fh, '<', $full_file_name)
        or die "error: can not open $full_file_name $!";

    # A lexical line variable is used so the caller's $_ (set by
    # File::Find while this runs inside its callback) is left untouched.
    while (my $line = <$fh>) {
        my @txt_segs = split m/img/i, $line;
        shift @txt_segs;    # text before the first "img" cannot hold its src
        for my $seg (@txt_segs) {
            push @linx, $1 if $seg =~ m@ src\s*=\s*"([^"]+)"@i;
        }
    }
    close $fh;

    return @linx;
}
# linkFullPath($dir,$locallink) returns a string that is the full path
# to the local link. For example,
# linkFullPath('/Users/t/public_html/a/b/', '../image/t.png') returns
# '/Users/t/public_html/a/image/t.png'.
#
# $dir and $locallink are concatenated as-is, so $dir must end with a
# slash (callers in this file pass the directory part returned by
# fileparse). In the result, runs of slashes are collapsed to one and
# every "/name/.." pair is removed.
sub linkFullPath($$) {
    my ($dir, $locallink) = @_;
    my $path = $dir . $locallink;

    $path =~ s{//+}{/}g;

    # Only a ".." that is a whole path segment cancels its parent; the
    # lookahead keeps names that merely start with ".." (e.g. "..cache")
    # from being eaten.
    1 while $path =~ s{/[^/]+/\.\.(?=/|\z)}{};

    return $path;
}
# listLocalLinks($html_file_full_path) returns an array where each
# element is a full path of a local link in the html.
#
# Links containing '#', and links starting with mailto:, http: or
# https:, are not local files and are dropped. The remaining links are
# joined onto the directory part of the html file's path.
#
# NOTE(review): getlinks() is not defined in this file; it must be
# supplied elsewhere before this sub can be called.
sub listLocalLinks($) {
    my ($htmlfile) = @_;

    # fileparse returns (name, directory, suffix); only the directory
    # is needed here.
    my $dir = (fileparse($htmlfile, '\.html'))[1];

    my @aa = grep { !m/\#/ && !m/^mailto:/ && !m/^https?:/ }
        getlinks($htmlfile);

    my @linkedFiles = map { linkFullPath($dir, $_) } @aa;
    return @linkedFiles;
}
# listInlineImg($html_file_full_path) returns an array where each
# element is a full path to an inline image in the html.
#
# Each src value found by getInlineImg() is joined onto the directory
# part of the html file's path with linkFullPath().
sub listInlineImg($) {
    my ($htmlfile) = @_;

    # fileparse returns (name, directory, suffix); only the directory
    # is needed here.
    my $dir = (fileparse($htmlfile, '\.html'))[1];

    my @paths = map { linkFullPath($dir, $_) } getInlineImg($htmlfile);
    return @paths;
}
##################################################
# checkLink() is the callback handed to File::Find's find(). For every
# text file whose name ends in ".html" it adds the file's own size to
# the sizes of its inline images, prints a "problem" line when the
# total exceeds $sizeLimit, and appends [total, path] to the global
# @result for the final report.
sub checkLink {
    my $file = $File::Find::name;
    return unless -T $file && $file =~ m@\.html$@;

    my $total = -s $file;
    for my $img (listInlineImg($file)) {
        # -s gives undef for a missing image; count it as 0 bytes.
        $total += (-s $img) || 0;
    }

    if ($total > $sizeLimit) {
        print "problem: file: $file, size: $total\n";
    }
    push @result, [$total, $file];
    return;
}
# Walk the tree; checkLink prints oversize files as it goes and fills
# @result with [total_size, path] pairs.
find(\&checkLink, $inpath);

# Largest total first.
@result = sort { $b->[0] <=> $a->[0] } @result;
print Dumper(\@result);

# End the final status line with a newline so it does not run into
# whatever is printed next.
print "done reporting. (any file above size are printed above.)\n";
__END__
Scheme (scsh)?
The script takes an input path and reports all HTML files under it that
are above a certain size (counting inline images).
It also prints a sorted report of the HTML files and their sizes.
(a copy of the script is here:
http://xahlee.org/_scripts/check_file_size.pl
)
Xah
(e-mail address removed)
∑ http://xahlee.org/
# perl
# Tue Oct 4 14:36:48 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org
use Data::Dumper;
use File::Find;
use File::Basename;

# Configuration.
# $inpath should be a full path; else $File::Find::dir won't give a
# full path.
$inpath = '/Users/t/web/mydirectory/';
# $inpath = $ARGV[0];

# Size threshold, compared against the byte count of an html file plus
# its inline images.
$sizeLimit = 800 * 1000;

# Get rid of trailing slashes (a lone "/" is left alone because the
# pattern requires at least one character before the slash).
while ($inpath =~ m@^(.+)/$@) { $inpath = $1; }

die "dir $inpath doesn't exist! $!" unless -e $inpath;
##################################################
# subroutines
# getInlineImg($file_full_path) returns an array that is a list of
# inline images, exactly as written in each src attribute. For example,
# it may return ('xx.jpg', '../image.png').
#
# The file is scanned line by line. Each line is cut at every "img"
# token (any letter case), and the first double-quoted src="..." value
# in each piece that follows such a token is collected. An img tag
# whose src sits on a different line than "img" is therefore not found.
#
# Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_name) = @_;
    my @linx;

    open(my $fh, '<', $full_file_name)
        or die "error: can not open $full_file_name $!";

    # A lexical line variable is used so the caller's $_ (set by
    # File::Find while this runs inside its callback) is left untouched.
    while (my $line = <$fh>) {
        my @txt_segs = split m/img/i, $line;
        shift @txt_segs;    # text before the first "img" cannot hold its src
        for my $seg (@txt_segs) {
            push @linx, $1 if $seg =~ m@ src\s*=\s*"([^"]+)"@i;
        }
    }
    close $fh;

    return @linx;
}
# linkFullPath($dir,$locallink) returns a string that is the full path
# to the local link. For example,
# linkFullPath('/Users/t/public_html/a/b/', '../image/t.png') returns
# '/Users/t/public_html/a/image/t.png'.
#
# $dir and $locallink are concatenated as-is, so $dir must end with a
# slash (callers in this file pass the directory part returned by
# fileparse). In the result, runs of slashes are collapsed to one and
# every "/name/.." pair is removed.
sub linkFullPath($$) {
    my ($dir, $locallink) = @_;
    my $path = $dir . $locallink;

    $path =~ s{//+}{/}g;

    # Only a ".." that is a whole path segment cancels its parent; the
    # lookahead keeps names that merely start with ".." (e.g. "..cache")
    # from being eaten.
    1 while $path =~ s{/[^/]+/\.\.(?=/|\z)}{};

    return $path;
}
# listLocalLinks($html_file_full_path) returns an array where each
# element is a full path of a local link in the html.
#
# Links containing '#', and links starting with mailto:, http: or
# https:, are not local files and are dropped. The remaining links are
# joined onto the directory part of the html file's path.
#
# NOTE(review): getlinks() is not defined in this file; it must be
# supplied elsewhere before this sub can be called.
sub listLocalLinks($) {
    my ($htmlfile) = @_;

    # fileparse returns (name, directory, suffix); only the directory
    # is needed here.
    my $dir = (fileparse($htmlfile, '\.html'))[1];

    my @aa = grep { !m/\#/ && !m/^mailto:/ && !m/^https?:/ }
        getlinks($htmlfile);

    my @linkedFiles = map { linkFullPath($dir, $_) } @aa;
    return @linkedFiles;
}
# listInlineImg($html_file_full_path) returns an array where each
# element is a full path to an inline image in the html.
#
# Each src value found by getInlineImg() is joined onto the directory
# part of the html file's path with linkFullPath().
sub listInlineImg($) {
    my ($htmlfile) = @_;

    # fileparse returns (name, directory, suffix); only the directory
    # is needed here.
    my $dir = (fileparse($htmlfile, '\.html'))[1];

    my @paths = map { linkFullPath($dir, $_) } getInlineImg($htmlfile);
    return @paths;
}
##################################################
# checkLink() is the callback handed to File::Find's find(). For every
# text file whose name ends in ".html" it adds the file's own size to
# the sizes of its inline images, prints a "problem" line when the
# total exceeds $sizeLimit, and appends [total, path] to the global
# @result for the final report.
sub checkLink {
    my $file = $File::Find::name;
    return unless -T $file && $file =~ m@\.html$@;

    my $total = -s $file;
    for my $img (listInlineImg($file)) {
        # -s gives undef for a missing image; count it as 0 bytes.
        $total += (-s $img) || 0;
    }

    if ($total > $sizeLimit) {
        print "problem: file: $file, size: $total\n";
    }
    push @result, [$total, $file];
    return;
}
# Walk the tree; checkLink prints oversize files as it goes and fills
# @result with [total_size, path] pairs.
find(\&checkLink, $inpath);

# Largest total first.
@result = sort { $b->[0] <=> $a->[0] } @result;
print Dumper(\@result);

# End the final status line with a newline so it does not run into
# whatever is printed next.
print "done reporting. (any file above size are printed above.)\n";
__END__