# strip_html($html)
#
# Reduces an HTML string to its words, separated by blanks.
#
# Steps, in order:
#   1. "&nbsp;" becomes a space.
#   2. Any tag carrying ALT="foo" is replaced by the literal text "foo".
#   3. <SCRIPT>/<STYLE> elements are removed together with their content.
#   4. Entity references are blanked by entity_strip().
#   5. Every remaining tag becomes a space.
#   6. Every non-alphanumeric character (underscore included) becomes a space.
#
# Parameter: the HTML text.  Returns the stripped text ('' for undef input).
sub strip_html {
    my $text = shift;
    return '' unless defined $text;

    # NOTE(review): the posted source showed this pattern as a literal space
    # followed by ';', which looks like an "&nbsp;" entity that was decoded
    # by the page displaying the code.  It is matched as the entity here.
    $text =~ s/&nbsp;/ /gi;

    # Keep the ALT text of a tag in place of the tag itself.
    $text =~ s/<[^>]*\s+ALT\s*=\s*"([^>"]*)"[^>]*>/ $1 /ig;

    # Drop script and style elements.  /s lets '.' cross newlines so that
    # multi-line blocks are removed too; an opener that is never closed is
    # removed through the end of the text.
    $text =~ s{<(?:SCRIPT|STYLE).*?(?:</(?:SCRIPT|STYLE)>|\z)}{ }gis;

    $text = entity_strip($text);

    # Strip all remaining HTML tags and replace them with blank spaces.
    $text =~ s/<[^>]*>/ /g;

    # Strip non-alphanumerics and underscores, again leaving a blank so that
    # adjacent words do not run together.
    $text =~ s/[\W_]/ /g;

    return $text;
}
# entity_strip($text)
#
# Replaces each named character reference from the list below ("&lt;",
# "&amp;", "&eacute;", ...) with a single space.  The terminating ';' is
# consumed when present; a reference written without it ("&copy") is
# replaced as well.  Names are matched case-sensitively, so "&ETH" and
# "&eth" are distinct entries.
#
# Parameter: the text to clean.  Returns the cleaned copy; undef is
# returned unchanged.
sub entity_strip {
    my $t = shift;
    return $t unless defined $t;

    my @entity = qw(
        lt gt amp quot nbsp iexcl cent pound curren yen brvbar sect uml copy
        ordf laquo not shy reg macr deg plusmn sup2 sup3 acute micro para
        middot cedil sup1 ordm raquo frac14 frac12 frac34 iquest
        Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil
        Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml
        ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash
        Ugrave Uacute Ucirc Uuml Yacute THORN szlig
        agrave aacute acirc atilde auml aring aelig ccedil
        egrave eacute ecirc euml igrave iacute icirc iuml
        eth ntilde ograve oacute ocirc otilde ouml divide oslash
        ugrave uacute ucirc uuml yacute thorn yuml
    );

    # One alternation handles every name in a single pass over the text.
    # No name in the list is a prefix of another, so the order of the
    # alternatives does not affect which one matches.
    my $names = join '|', @entity;
    $t =~ s/&(?:$names);?/ /g;

    return $t;
}
IncrediBILL: Any chance you want to do this in PHP? I have some easy solutions for your problem, including code, just not in Perl.
Either way, you should use the IMAP interface which allows you to open the mailbox and download the content whether you're hosting it and have direct access or whether it's on a different server than the website.
Accessing the mailbox files directly is frowned upon and usually not possible on most servers, because the security settings jail the software to the user account. That makes IMAP the way to go.
# NOTE(review): loose statements quoted outside any sub.  The next two use
# `!` as the s/// delimiter but show only two delimiters each, so no
# replacement text is visible -- possibly lost when the code was posted;
# confirm against the real file.
$HTML_Text =~ s!<([^>]*?)>!g;# strip all HTML tag and replace with blank spaces:
$HTML_Text =~ s!(\W|\_)!g;# Strip non-alphanumerics and underscores:
# Same tag-matching pattern, written with `/` delimiters and a single space
# as the replacement text.
$HTML_Text =~ s/<([^>]*?)>/ /g;
# NOTE(review): as shown, this statement has no replacement part and no
# terminating semicolon -- presumably a space replacement was intended, as
# in the statement above; confirm.
$HTML_Text =~ s!(\W|\_)!g
# strip_html($html)
#
# Turns HTML text into plain text.  The patterns below also handle
# mail-style quoted-printable escapes ("=20", "=E1", "=?...?Q?") and
# "Content-Transfer-Encoding"/"Content-Type" header lines, so the input
# appears to be a raw mail message -- confirm against the caller.
#
# Steps, in order:
#   1. "&nbsp;" and a handful of quoted-printable escapes are translated.
#   2. Any tag carrying ALT="foo" is replaced by the literal text "foo".
#   3. <SCRIPT>/<STYLE> elements are removed together with their content.
#   4. Entity references are blanked by entity_strip(); remaining tags
#      become spaces.
#   5. If a "Content-Transfer-Encoding: quoted-printable" header is present,
#      only the text between its first and second occurrence (or the end of
#      the text) is kept.
#   6. Leftover mail artifacts are removed, runs of newlines are collapsed
#      and a separator line is inserted before the first "To:".
#
# Parameter: the text to clean.  Returns the cleaned text ('' for undef).
sub strip_html {
    my $text = shift;
    return '' unless defined $text;

    # NOTE(review): the posted source showed this pattern as a literal space
    # followed by ';', which looks like an "&nbsp;" entity that was decoded
    # by the page displaying the code.  It is matched as the entity here.
    $text =~ s/&nbsp;/ /gi;

    ### ADDED #################################
    # Soft spaces and the "=?charset?Q?" ... "?=" encoded-word wrappers.
    $text =~ s/=20|=\?.*?Q\?|=\?.*?q\?|\?=/ /g;
    # Selected quoted-printable escapes, folded to plain characters.
    $text =~ s/=E1/a/g;
    $text =~ s/=E9/e/g;
    $text =~ s/=ED/i/g;
    $text =~ s/=F3|ó/o/g;
    # NOTE(review): =B4 mapped to "u" is kept exactly as posted -- confirm
    # that this is the intended escape.
    $text =~ s/=B4/u/g;
    $text =~ s/=F1/ñ/g;
    $text =~ s/=2C/\,/g;
    $text =~ s/=3B/\;/g;
    $text =~ s/^ //g;
    ###########################################

    # Keep the ALT text of a tag in place of the tag itself.
    $text =~ s/<[^>]*\s+ALT\s*=\s*"([^>"]*)"[^>]*>/ $1 /ig;

    # Drop script and style elements.  /s lets '.' cross newlines so that
    # multi-line blocks are removed too; an opener that is never closed is
    # removed through the end of the text.
    $text =~ s{<(?:SCRIPT|STYLE).*?(?:</(?:SCRIPT|STYLE)>|\z)}{ }gis;

    $text = entity_strip($text);
    $text =~ s/<[^>]*>/ /g;

    ############
    # Split on the header itself rather than on a stand-in '|' character, so
    # a literal '|' in the message cannot truncate the result, and leave the
    # text alone when the header is absent instead of discarding it.
    my @sections = split /Content-Transfer-Encoding\: quoted-printable/i, $text, -1;
    $text = $sections[1] if @sections > 1;

    $text =~ s/°|=\n|Content-Type\: text\/html\; charset=\".*?\"|--\_.*?\_|>\n\n//g;
    $text =~ s/\n+/\n/g;
    $text =~ s/To\:/\n---------------------------------------------\n\nTo\: /;
    ############

    return $text;
}
# entity_strip($text)
#
# Replaces each named character reference from the list below ("&lt;",
# "&amp;", "&eacute;", ...) with a single space.  The terminating ';' is
# consumed when present; a reference written without it ("&copy") is
# replaced as well.  Names are matched case-sensitively, so "&ETH" and
# "&eth" are distinct entries.
#
# Parameter: the text to clean.  Returns the cleaned copy; undef is
# returned unchanged.
sub entity_strip {
    my $t = shift;
    return $t unless defined $t;

    my @entity = qw(
        lt gt amp quot nbsp iexcl cent pound curren yen brvbar sect uml copy
        ordf laquo not shy reg macr deg plusmn sup2 sup3 acute micro para
        middot cedil sup1 ordm raquo frac14 frac12 frac34 iquest
        Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil
        Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml
        ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash
        Ugrave Uacute Ucirc Uuml Yacute THORN szlig
        agrave aacute acirc atilde auml aring aelig ccedil
        egrave eacute ecirc euml igrave iacute icirc iuml
        eth ntilde ograve oacute ocirc otilde ouml divide oslash
        ugrave uacute ucirc uuml yacute thorn yuml
    );

    # One alternation handles every name in a single pass over the text.
    # No name in the list is a prefix of another, so the order of the
    # alternatives does not affect which one matches.
    my $names = join '|', @entity;
    $t =~ s/&(?:$names);?/ /g;

    return $t;
}
[edited by: phranque at 12:35 am (utc) on Aug 15, 2013]
[edit reason] disabled graphic smileys [/edit]