From d7d1b0fa6c5e9e048c87dee7309f6350ac3745ba Mon Sep 17 00:00:00 2001 From: raf Date: Fri, 11 Nov 2005 21:33:34 +1100 Subject: [PATCH] 20051111 - Extract long names for attachments inside winmail.dat attachments - Convert bad characters in filenames into underscore rather than dash - Added -S option to override underscore in the above change - Handled Outlook2003's mislabelling of csv as application/vnd.ms-excel - Fixed comment stripping of files for -D and -K option arguments --- CHANGELOG | 8 +++ textmail | 188 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 120 insertions(+), 76 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 7d888c7..234d19b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,11 @@ +20051111 + + - Extract long names for attachments inside winmail.dat attachments + - Convert bad characters in filenames into underscore rather than dash + - Added -S option to override underscore in the above change + - Handled Outlook2003's mislabelling of csv as application/vnd.ms-excel + - Fixed comment stripping of files for -D and -K option arguments + 20050926 - Use antiword in preference to catdoc for translating msword documents diff --git a/textmail b/textmail index cd39939..5e708d1 100755 --- a/textmail +++ b/textmail @@ -20,7 +20,7 @@ use strict; # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # or visit http://www.gnu.org/copyleft/gpl.html # -# 20050926 raf +# 20051111 raf =head1 NAME @@ -47,6 +47,7 @@ I - mail filter to replace MS Word/HTML attachments with plain text -V - Don't delete video attachments -X - Don't delete MS Windows executable attachments -B - Don't recode text that was base64-encoded + -S ' ' - Replace spaces in filenames with ' ' (default is '_') -O - Delete all application/octet-stream attachments -! - Delete all application/* attachments -D hdrs - Delete headers (list of header prefixes and filenames) @@ -56,13 +57,13 @@ I - mail filter to replace MS Word/HTML attachments with plain text =head1 DESCRIPTION -I filters a mail message, replacing MS Word, MS Excel, HTML, -RTF and PDF attachments with the plain text contained therein. By default, -the following attachments are also deleted: image, audio, video and MS -Windows executables. MS winmail.dat attachments are replaced by their -contents which are then replaced by text or deleted in the same fashion. -Any of these actions can be suppressed with the command line options. -Mail headers can also be selectively deleted. +I filters a mail message, replacing MS Word, MS Excel, HTML, RTF +and PDF attachments with the plain text contained therein. By default, the +following attachments are also deleted: image, audio, video and MS Windows +executables. MS winmail.dat attachments are replaced by their contents which +are then replaced by text or deleted in the same fashion. Any of these +actions can be suppressed with the command line options. Mail headers can +also be selectively deleted. This is useful for increasing the accessibility of mail messages (by reducing their dependence on proprietary file formats), for dramatically @@ -106,89 +107,98 @@ already and ensures that there is a blank line at the bottom of the output. It also performs mailbox quoting on any lines in the body that look like mailbox C headers. Only use this when the output is to be stored directly in a mailbox file. It is not necessary when the output is to be -sent to an SMTP server or when I is being used as a mail filter -by I. +sent to an SMTP server or when I is being used as a mail filter by +I. =item C<-W> -By default, I replaces MS Word attachments with inline plain -text attachments that contain just the plain text within the original -document. This option leaves MS Word attachments intact. +By default, I replaces MS Word attachments with inline plain text +attachments that contain just the plain text within the original document. +This option leaves MS Word attachments intact. =item C<-E> -By default, I replaces MS Excel attachments with CSV file +By default, I replaces MS Excel attachments with CSV file attachments that contain just the data within the original document. This option leaves MS Excel attachments intact. =item C<-H> -By default, I replaces HTML attachments with inline plain text +By default, I replaces HTML attachments with inline plain text attachments that contain just the text within the original document. It also reduces text-versus-html alternative attachments to just the text attachment. This option leaves HTML (and alternative) attachments intact. =item C<-R> -By default, I replaces RTF attachments with inline plain text +By default, I replaces RTF attachments with inline plain text attachments that contain just the plain text within the original document. This option leaves RTF attachments intact. =item C<-P> -By default, I replaces PDF attachments with inline plain text +By default, I replaces PDF attachments with inline plain text attachments that contain just the plain text within the original document. This option leaves PDF attachments intact. =item C<-U> -By default, I replaces MS TNEF (i.e. C) -attachments with the attachments contained therein which are then translated -to text as normal. This option leaves C attachments intact. -This option, together with the C<-!> option will cause winmail.dat -attachments to be deleted rather than translated. +By default, I replaces MS TNEF (i.e. C) attachments +with the attachments contained therein which are then translated to text as +normal. This option leaves C attachments intact. This option, +together with the C<-!> option will cause winmail.dat attachments to be +deleted rather than translated. =item C<-L> -By default, I replaces C attachments -with just the data fork attachment contained therein which is then -translated to text as normal. This option leaves appledouble attachments -intact. However, the data fork attachment will still be translated as normal -resulting in a probably inappropriate and possibly broken resource fork -attachment. Therefore, this option should probably only be used in -conjunction with other options that suppress the translation of the data -fork attachment. +By default, I replaces C attachments with +just the data fork attachment contained therein which is then translated to +text as normal. This option leaves appledouble attachments intact. However, +the data fork attachment will still be translated as normal resulting in a +probably inappropriate and possibly broken resource fork attachment. +Therefore, this option should probably only be used in conjunction with +other options that suppress the translation of the data fork attachment. =item C<-I> -By default, I deletes image attachments. This option leaves -image attachments intact. +By default, I deletes image attachments. This option leaves image +attachments intact. =item C<-A> -By default, I deletes audio attachments. This option leaves -audio attachments intact. +By default, I deletes audio attachments. This option leaves audio +attachments intact. =item C<-V> -By default, I deletes video attachments. This option leaves -video attachments intact. +By default, I deletes video attachments. This option leaves video +attachments intact. =item C<-X> -By default, I deletes attachments containing MS Windows +By default, I deletes attachments containing MS Windows executables. That means C attachments with the following filename extensions: C, C, C, C, C, C, C and C. This option leaves MS Windows executable -attachments intact. To delete C files as well, use the +attachments intact. To delete C files as well, you could use either the +C<-O> option or the C<-!> option. =item C<-B> -By default, when text is encountered that is C-encoded, -I will recode it as either C<7bit> or C, -whichever is appropriate. This option suppresses this recoding. Note that if -the text is large enough and contains a high enough proportion of non-ASCII -characters, it will remain C-encoded to minimise space. +By default, when text is encountered that is C-encoded, I +will recode it as either C<7bit> or C, whichever is +appropriate. This option suppresses this recoding. Note that if the text is +large enough and contains a high enough proportion of non-ASCII characters, +it will remain C-encoded to minimise space. + +=item C<-S> I<' '> + +When translating files, I replaces bad characters such as space +characters with the underscore character. This option lets you specify a +character other than underscore to which bad filename characters will be +converted. In other words, you can use this option to preserve space +characters in attachment filenames (other bad filename characters will then +be converted to spaces as well). =item C<-O> @@ -211,7 +221,7 @@ example, C deletes all headers whose names begin with C. =item C<-K> I -By default, I deletes several types of non-text attachment. The +By default, I deletes several types of non-text attachment. The C<-O> and C<-!> options delete even more. This option specifies, by mimetype and/or filename extension, a list of attachments not to delete. This overrides all deletions. @@ -227,7 +237,7 @@ documents. =item C<-f> -Whenever I is unable to translate any attachment into text, it +Whenever I is unable to translate any attachment into text, it will leave the attachment intact. This happens when the requisite translation software can't be found, when it runs but returns an error code, and when it produces an empty file. This option causes the empty translation @@ -273,37 +283,41 @@ delete windows executables (with output in mailbox format): =head1 REQUIREMENTS MS Word and RTF documents are translated into plain text using -I or I. If I can't find I or +I or I. If I can't find I or I, then MS Word and RTF attachments are left intact. So make sure that I or I is installed and in the C<$PATH>. MS Excel documents are translated into csv files using I. If -I can't find I, then MS Excel attachments are left +I can't find I, then MS Excel attachments are left intact. So make sure that I is installed and in the C<$PATH>. HTML documents are translated into plain text using I. If -I can't find I, then HTML attachments are left intact. -So make sure that I is installed and in the C<$PATH>. +I can't find I, then HTML attachments are left intact. So +make sure that I is installed and in the C<$PATH>. PDF documents are translated into plain text using I. If -I can't find I, then PDF attachments are left +I can't find I, then PDF attachments are left intact. So make sure that I is installed and in the C<$PATH>. -I also requires I and I and I +I also requires I and I and I (which come with I) and I. -If I fails to create a temporary directory, or if it is -instructed to do nothing (i.e. C<-WEHRPULIAVX>), then it degenerates into -I. +If I fails to create a temporary directory, or if it is instructed +to do nothing (i.e. C<-WEHRPULIAVX>), then it degenerates into I. =head1 CAVEAT -If I is unable to create a temporary directory (in C), -then it degenerates into I. Without a temporary directory, no -attachments will be translated or deleted no matter what options (even -C<-f>) were given to I. So make sure that C is writable. -Also make sure that I is available otherwise an insecure -temporary directory will be created. +Mail messages that are signed or encrypted are not translated. + +The latest version of I at the time of writing (i.e. +catdoc-0.93.3) loses data. + +If I is unable to create a temporary directory (in C), then +it degenerates into I. Without a temporary directory, no attachments +will be translated or deleted no matter what options (even C<-f>) were given +to I. So make sure that C is writable. Also make sure that +I is available otherwise an insecure temporary directory will be +created. =head1 SEE ALSO @@ -319,7 +333,7 @@ C =head1 AUTHOR -20050926 raf +20051111 raf =head1 URL @@ -351,6 +365,7 @@ sub help " -V - Don't delete video attachments\n", " -X - Don't delete MS Windows executable attachments\n", " -B - Don't recode text that was base64-encoded\n", + " -S ' ' - Replace spaces in filenames with ' ' (default is '_')\n", " -O - Delete all application/octet-stream attachments\n", " -! - Delete all application/* attachments\n", " -D hdrs - Delete headers (list of header prefixes and filenames)\n", @@ -654,7 +669,7 @@ sub filename # rfc2183, rfc2045? { my $p = shift; my $fn = param($p, 'content-disposition', 'filename') || param($p, 'content-type', 'name') || 'attachment' . ++$unique; - $fn =~ s/^.*[\\\/]//, $fn =~ tr/\x00-\x1f !"#\$%&'()*\/:;<=>?@[\\]^`{|}~\x7f/-/s; + $fn =~ s/^.*[\\\/]//, $fn =~ tr/\x00-\x1f !"#\$%&'()*\/:;<=>?@[\\]^`{|}~\x7f/_/s; return $fn; } @@ -694,7 +709,7 @@ sub newmail # rfc2822, rfc2045, rfc2046, rfc2183 (also rfc3282, rfc3066, rfc2424 my $type = $a{type} || (exists $a{parts} ? 'multipart/mixed' : exists $a{message} ? 'message/rfc822' : 'text/plain'); my $multi = $type =~ /^multipart\//i; my $msg = $type =~ /^message\/rfc822$/i; - ($a{body}, $a{modified}, $a{read}, $a{size}) = (do { local $/; my $b = ; close F; $b }, rfc822date((stat _)[9]), rfc822date((stat _)[8]), (stat _)[7]) if exists $a{filename} && !exists $a{body} && !exists $a{message} && !exists $a{parts} && -r $a{filename} && stat($a{filename}) && open F, $a{filename}; + ($a{body}, $a{modified}, $a{read}, $a{size}) = (do { local $/; my $b = ; close F; $b }, exists $a{modified} ? $a{modified} : rfc822date((stat _)[9]), exists $a{read} ? $a{read} : rfc822date((stat _)[8]), (stat _)[7]) if exists $a{filename} && !exists $a{body} && !exists $a{message} && !exists $a{parts} && -r $a{filename} && stat($a{filename}) && open F, $a{filename}; ($a{filename}) = $a{filename} =~ /([^\\\/]+)$/ if $a{filename}; my $bound = $multi ? join '', map { substr $bchar, int(rand(length $bchar)), 1 } 0..30 : ''; my $disp = $a{disposition} || ($type =~ /^(?:text\/|message\/rfc822)/i ? 'inline' : 'attachment'); @@ -809,11 +824,13 @@ sub add_mimetypes sub MESSAGE { 1 } sub ATTACHMENT { 2 } -sub MESSAGE_CLASS { 7 << 16 | 0x8008 } -sub ATTACH_DATA { 6 << 16 | 0x800F } -sub ATTACH_FILENAME { 1 << 16 | 0x8010 } -sub ATTACH_RENDDATA { 6 << 16 | 0x9002 } -sub VERSION { 8 << 16 | 0x9006 } +sub MESSAGE_CLASS { 0x00078008 } +sub ATTACH_ATTACHMENT { 0x00069005 } +sub ATTACH_DATA { 0x0006800f } +sub ATTACH_FILENAME { 0x00018010 } +sub ATTACH_RENDDATA { 0x00069002 } +sub ATTACH_MODIFIED { 0x00038013 } +sub VERSION { 0x00089006 } my $data; my @attachment; my $attachment; my $pos; sub winmail @@ -834,6 +851,7 @@ sub winmail my $id = unpack 'V', substr $data, $pos + 1, 4; return unless $id == MESSAGE_CLASS; $pos += 5; my $len = unpack 'V', substr $data, $pos, 4; $pos += 4; + return 0 if $pos + $len > length $data; my $buf = substr($data, $pos, $len); $pos += $len; my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2; my $tot = unpack '%16C*', $buf; @@ -846,6 +864,7 @@ sub winmail return 0 unless defined $type && $type == MESSAGE; ++$pos; my $id = unpack 'V', substr $data, $pos, 4; $pos += 4; my $len = unpack 'V', substr $data, $pos, 4; $pos += 4; + return 0 if $pos + $len > length $data; my $buf = substr($data, $pos, $len); $pos += $len; my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2; my $tot = unpack '%16C*', $buf; @@ -859,15 +878,29 @@ sub winmail my $id = unpack 'V', substr $data, $pos, 4; $pos += 4; push @attachment, $attachment = {} if $id == ATTACH_RENDDATA; my $len = unpack 'V', substr $data, $pos, 4; $pos += 4; + return 0 if $pos + $len > length $data; my $buf = substr($data, $pos, $len); $pos += $len; my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2; my $tot = unpack '%16C*', $buf; return 0 unless $chk == $tot; - $attachment->{body} = $buf, $attachment->{size} = length($buf) if $id == ATTACH_DATA; $buf =~ s/[\0\s]+$//; - $attachment->{filename} = $buf, $attachment->{type} = $mimetype{($attachment->{filename} =~ /\.([^.]+)$/) || 'other'} || 'application/octet-stream' if $id == ATTACH_FILENAME; + $attachment->{body} = $buf, $attachment->{size} = length $buf if $id == ATTACH_DATA; + $buf =~ s/\x00+$//, $attachment->{filename} = $buf, $attachment->{type} = $mimetype{($attachment->{filename} =~ /\.([^.]+)$/) || 'other'} || 'application/octet-stream' if $id == ATTACH_FILENAME && !exists $attachment->{filename}; + my $fname; $attachment->{filename} = $fname, $attachment->{type} = $mimetype{($attachment->{filename} =~ /\.([^.]+)$/) || 'other'} || 'application/octet-stream' if $id == ATTACH_ATTACHMENT && ($fname = realname($buf)); + use POSIX; sub word { unpack 'v', substr($_[0], $_[1] * 2, 2) } + $attachment->{modified} = strftime '%a, %d %b %Y %H:%M:%S +0000', gmtime mktime word($buf, 5), word($buf, 4), word($buf, 3), word($buf, 2), word($buf, 1) - 1, word($buf, 0) - 1900 if $id == ATTACH_MODIFIED; return 1; } + sub realname + { + my $buf = shift; + my $pos = index $buf, "\x1e\x00\x01\x30\x01"; return unless $pos >= 0; $pos += 8; + my $len = unpack 'V', substr($buf, $pos, 4); $pos += 4; + my $name = substr($buf, $pos, $len) or return; + $name =~ s/\x00+$//; + return $name; + } + my $m = shift; $pos = 0; $data = body($m); @attachment = (); my $signature = unpack 'V', substr($data, $pos, 4); $pos += 4; @@ -885,7 +918,7 @@ sub winmail my %opt; use Getopt::Std; -help unless getopts 'hmrwMWEHRPLUIAVXBO!D:K:f?', \%opt; +help unless getopts 'hmrwMWEHRPLUIAVXBS:O!D:K:f?', \%opt; help if exists $opt{h}; man if exists $opt{m}; nroff if exists $opt{r}; @@ -912,6 +945,7 @@ my $remove_audio = ! exists $opt{A}; my $remove_video = ! exists $opt{V}; my $remove_exe = ! exists $opt{X}; my $recode_base64_text = ! exists $opt{B}; +my $replace_space = $opt{S} if exists $opt{S}; my $remove_octet = exists $opt{O}; my $remove_application = exists $opt{'!'}; my $remove_headers = exists $opt{D}; @@ -923,7 +957,7 @@ chop(my $tmp = `$mktemp -dq /tmp/textmail.XXXXXX`) if $removing && defined $mkte if (!$removing || (($? || !defined $tmp || ! -d $tmp) && !mkdir($tmp = "/tmp/textmail.$$", 0700))) { exec '/bin/cat' or print STDERR ''; # suppress warning - print while (); # slow cat if exec fails + print do { undef $/; }; # slow cat if exec fails exit; }; @@ -1122,14 +1156,16 @@ sub translate return $part if !defined $cmd && !$force; my $origpath = filename($part); - $origpath .= '.' . $ext[0] unless $origpath =~ /\.(?:@{[join '|', @ext]})$/i; + $origpath =~ s/_+/$replace_space/g if defined $replace_space; + $origpath .= '.' . $ext[0] unless $origpath =~ /\.(?:@{[join '|', @ext, $fmt]})$/i; my $textpath = $origpath; $textpath =~ s/\.(?:@{[join '|', @ext]})$/.$fmt/i; - $textpath .= ".$fmt" if $textpath eq $origpath; + $textpath .= ".$fmt" if $textpath eq $origpath && $textpath !~ /\.$fmt$/i; return newmail(filename => $textpath, body => '') if !defined $cmd && $force; my $origdata = body($part); open A, ">$tmp/$origpath" and do { print A $origdata; close A }; - my $failed = system($cmd . ' ' . quotemeta("$tmp/$origpath") . ' > ' . quotemeta("$tmp/$textpath")) || -s $origpath && -z $textpath; unlink "$tmp/$origpath"; + my $failed = $origpath ne $textpath && system($cmd . ' ' . quotemeta("$tmp/$origpath") . ' > ' . quotemeta("$tmp/$textpath")) || -s "$tmp/$origpath" && -z "$tmp/$textpath"; + unlink "$tmp/$origpath" unless $origpath eq $textpath; unlink("$tmp/$textpath"), return $part if $failed && !$force; $part = newmail(filename => "$tmp/$textpath"); unlink "$tmp/$textpath"; return $part; @@ -1163,7 +1199,7 @@ sub get_file while () { - s/#.*$//, s/^\s+//, s/\s+$//, next unless $_; + s/#.*$//, s/^\s+//, s/\s+$//; next unless $_; push @list, $_; }