$str = 'this is some foo bar crap';
# real data comes from MySQL, but for simplicity...
%pattern = (
'foo' => '****',
'bar' => '***'
);
for (keys %pattern) {
$str =~ /$_\b/$pattern{$_}/gi;
} %pattern = (
'f\W*o\W*o' => '****',
'b\W*a\W*r' => '***'
); # typed for this post, not tested
$newStr = $str =~ s/\W//g;
for (keys %pattern) {
if ($newStr =~ /$_\b/i) {
# but then how do I know what to replace in $str?
}
}
but then how do I know what to replace in $strWouldn’t you still be running the same word filter? The only difference is whether you’re excluding the \W beforehand or concurrently.
If both approaches are equally easy for you to read and debug, is there any noticeable difference in speed?
\b((?:ba+d+|candy|dum+b*|fa+t+|hot|jack|smart|stupid|lard)\W*)?[a@][sz\$][sz\$](?:clown|hat|heat|h+o+l+e|wipe)?(?!et|ert|ist|um|ess|ign|ure|ault) \b((?:b\W*a+\W*d+|c\W*a\W*n\W*d\W*y|d\W*u\WebmasterWorld*+b*|f\W*a+\W*t+|h\W*o\W*t|j\W*a\W*c\W*k|s\WebmasterWorld*a\W*r\W*t|s\W*t\W*u\W*p\W*i\W*d|l\W*a\W*r\W*d)\W*)?[a@]\W*[sz\$]\W*[sz\$](?:c\W*l\W*o\W*w\W*n|h\W*a\W*t|h\W*e\W*a\W*t|h+\W*o+\W*l+\W*e|w\W*i\W*p\W*e)?(?!et|ert|ist|um|ess|ign|ure|ault) Wouldn’t you still be running the same word filter? The only difference is whether you’re excluding the \W beforehand or concurrently.
$str = 'this is some f-o-o bar crap';
$newStr = $str =~ s/\W//g;
# Result: "thisissomefoobarcrap"
for (keys %pattern) {
# "foo" matches
if ($newStr =~ /$_/i) {
# so I know that the problem word exists, but I don't know how
# to tell it to use that info to replace f-o-o
}
} A more challenging question is how to distinguish spurious \W insertions from legitimate word breaks: I assume you wouldn’t want to filter someone who said “And then you wash it in vinegar”, while retaining the ability to filter someone who says “f o o b a r”
It also of course depends on how large the forum-or-whatever is. Is it too large to manually keep track of people who deliberately try to bypass word filters, and seek them out for slaps on the wrist?
What about people who sneakily replace one or more letters with a non-ASCII lookalike? Or is it an Old Skool site like the present one, where anything outside 1252 gets entitized?
$newStr = $str =~ s/\W|<[^>+]//g;
for (keys %pattern) {
if ($newStr =~ /$_/i) {
# email me
last;
}
} $str = 'this is some f-o-o bar crap';
if ($str =~ /(f\W*o\W*o|lorem|ipsum)/i) {
print $1;
} # DELIMIT PATTERN WITH \W* || <[^>]+> BETWEEN EACH CHARACTER TO CATCH PEOPLE
# TRYING TO GET PASS THE FILTER
# Example, using 'f-o-o' or 'f<br>o<br>o' instead of 'foo'
#
# Notes:
# 1. I'm using the /x modifier throughout to make it easier to read, but keep
# in mind that this negates spaces so it might throw things off in practice
#
# 2. \Q ... \E escapes patterns in regexes, so I used that when I needed to
# escape several things in a row
# the test string
$_ = 'this is $ome+hing like some f-o-o bar crap';
# use single quotes to define everything literally, but remember that regex
# reads it as double quotes
$pattern = 'f[o0]o|[$s]ome(?:[t+]hing)?|(?!this|that)other';
# Step 1, I'm going to use ï as a delimiter so remove it from $_ to
# ensure no accidental matches later
s/ï/i/;
# Step 2, find anything between [ .. ] or (?! .. ) in $pattern, add it to
# %temp, then replace it in $pattern with ï$xï
$x = 1;
while (
$pattern =~ m{(
\[[^]]+\] |
\Q(?!\E[^)]+\Q)\E
)}x
# safety net, real value will be number of potential matches in $pattern
&& $x < 10) {
$temp{$x} = $1;
$pattern =~ s/\Q$1\E/ï$xï/;
$x++;
}
# at this point:
# $pattern = 'fï1ïo|ï2ïome(?:ï3ïhing)?|ï4ïother';
# $temp{'1'} = '[o0]';
# $temp{'2'} = '[$s]';
# $temp{'3'} = '[t+]';
# $temp{'4'} = '(?this_that)';
# Step 3, replace all optional whitespace with (\W|<[^>]+>)*
$delimiter = '(?:\\W|<[^>]+>)*';
$pattern =~ s/\s[?*]/$delimiter/g;
# Step 4, substitute all \w with \w(\W|<[^>]+>)* in $pattern unless it's
# followed by *, ?, (, |, at the end of the string (\Z), or
# ), )?, )* AND followed by | or \Z
$pattern =~ s{
(
ï\dï |
[a-z]
)
(?!
[*?(|] |
\Z |
\)[?*]?(
\| |
\Z
)
)
}
{$1$delimiter}xg;
# at this point:
# $pattern = 'f(?:\W|<[^>]+>)*ï1ï(?:\W|<[^>]+>)*o|ï2ï(?:\W|<[^>]+>)*o(?:\W|<[^>]+>)*m(?:\W|<[^>]+>)*e(?:ï3ï(?:\W|<[^>]+>)*h(?:\W|<[^>]+>)*i(?:\W|<[^>]+>)*n(?:\W|<[^>]+>)*g)?|ï4ï(?:\W|<[^>]+>)*o(?:\W|<[^>]+>)*t(?:\W|<[^>]+>)*h(?:\W|<[^>]+>)*e(?:\W|<[^>]+>)*r';
# Step 5, go back and replace :ï:1:ï: with their original
for (sort keys %temp) {
$pattern =~ s/ï$_ï/$temp{$_}/g;
}
# at this point:
# $pattern = 'f(?:\W|<[^>]+>)*[o0](?:\W|<[^>]+>)*o|[$s](?:\W|<[^>]+>)*o(?:\W|<[^>]+>)*m(?:\W|<[^>]+>)*e(?:[t+](?:\W|<[^>]+>)*h(?:\W|<[^>]+>)*i(?:\W|<[^>]+>)*n(?:\W|<[^>]+>)*g)?|(?!this|that)(?:\W|<[^>]+>)*o(?:\W|<[^>]+>)*t(?:\W|<[^>]+>)*h(?:\W|<[^>]+>)*e(?:\W|<[^>]+>)*r';
# Step 6, $pattern is done so let's do it
# Note, I'm not sure if using \b here is necessary, but it doesn't hurt
# I found that \b[$s] didn't match like expected, so lucy24 gave me
# the (^|\W|\$) alternative idea
s/(\b|^|\W|\$)(?:$pattern)(?:e[dr]|ing?|e?s|y)?\b/$1****/xgi;
print;
# Returns:
# this is **** like **** **** bar crap
# the test string
$_ = 'this is $ome+hing like some f-o-o bar crap';
# the test pattern
$pattern = 'f[o0]o|[$s]ome(?:[t+]hing)?|(?!this|that)other';
# I added these suffixes here so that they'll be delimited, too
$pattern = '(?:' . $pattern . ')(?:e[dr]|ing?|e?s|y)?';
@more =
$pattern =~ m{(
\[[^]]+\] |
\Q(?ï<\E[^)]+\Q)\E |
\Q(?!\E[^)]+\Q)\E
)}xg;
unshift(@more, '');
for ($x = 1; $x <= $#more; $x++) {
$hash{$more[$x]} = 'ï' . $x . 'ï';
$temp{$x} = $more[$x];
if ($patternFix) { $patternFix .= '|'; }
$patternFix .= '(' . quotemeta($more[$x]) . ')';
}
# at this point
# $patternFix = '(\[o0\])|(\[\\\$s\])|(\[t\+\])|(\(\?\!this\|that\))|(\[dr\])';
$pattern =~ s/$patternFix/$hash{$+}/gi;
# at this point
$pattern = '(?:fï1ïo|ï2ïome(?:ï3ïhing)?|ï4ïother)(?:eï5ï|ing?|e?s|y)?';
$delimiter = '(?:\\W|<[^>]+>)*';
$pattern =~ s/\s[?*]/$delimiter/g;
$pattern =~ s{
(
ï\dï |
[a-z]
)
(?!
[*?(|] |
\)[?*]? |
\Z
)
}
{$1$delimiter}xg;
$pattern =~ s/ï(\d+)ï/$temp{$+}/gi;
s/(\b|^|\W|\$)(?:$pattern)\b/$1$2****/gi;
print;
# Returns:
# this is **** like **** **** bar crap # the test pattern I'm using
$pattern = 'f[o0]o|[$s]ome(?:[t+]hing)?|(?!this|that)other';
# this pushes $1 of the regex match to the @more array
# so anything in $pattern that matches [...], (?<...>, or (?!...) is
# pushed to @more
@more =
$pattern =~ m{(
\[[^]]+\] |
\Q(?<\E[^>]+\Q>\E |
\Q(?!\E[^)]+\Q)\E
)}xg;
# since regex starts at $1 instead of 0, I'm adding a blank
# index to the [0] position
unshift(@more, '');
# loop from 1 to the length of @more
for ($x = 1; $x <= $#more; $x++) {
# add to a %hash associative array
# since the first match in $pattern should be [o0], $more[1]
# should equal [o0]. So now:
# $hash{'[o0]'} = 'ï1ï';
# $temp{'1'} = '[o0]';
$hash{$more[$x]} = 'ï' . $x . 'ï';
$temp{$x} = $more[$x];
# quotemeta() auto-quotes meta characters, so now:
# $patternFix = '(\[o0\])';
if ($patternFix) { $patternFix .= '|'; }
$patternFix .= '(' . quotemeta($more[$x]) . ')';
# on the next loop, $patternFix would become:
# $patternFix = '(\[o0\])|(\[\\\$s\])';
} $pattern =~ s/$patternFix/$hash{$+}/gi; (I assume the last bit is the flags for “global” and “case-insensitive”.)
[edited by: phranque at 8:45 pm (utc) on Oct 20, 2021]
[edit reason] disable graphic smile faces [/edit]
# assuming that $_ and $pattern are set previously
s/ï/i/;
# this is marginally faster than the while() statement I'd
# used earlier
@more =
$pattern =~ m{(
# catch [...]
\[.+?\] |
# catch group names, (?<name>foo)
\Q(?<\E.+?\Q>\E |
# negative lookahead
# I don't use positive lookahead anywhere, but if I did
# then it could be added here
\Q(?!\E.+?\Q)\E
)}xg;
unshift(@more, '');
my %temp;
for ($x = 1; $x <= $#more; $x++) {
$temp{$x} = $more[$x];
$pattern =~ s/\Q$more[$x]\E/ï$xï/g;
}
$delimiter = '(?:\W|<.+?>)*';
$pattern =~ s{
(
ï\dï |
[a-z]
)
(?!
\( |
\| |
\] |
\)[?*] |
\Z
)
}
{$1$delimiter}xg;
for (sort keys %temp) {
$pattern =~ s/ï$_ï/$temp{$_}/g;
}
# I discovered that \w+ and [ab]+ were becoming
# [ab](?:\W|<[^>]+>)*+, so this converts it back to
# (?:[ab](?:\W|<[^>]+>)*)+
$pattern =~ s{(\w|\[[^]]+\])\Q$delimiter\E(\s*)([?+*])}
{(?:$1$delimiter)$3$2}g;
s/(\b|^|\W|\$)$pattern\b/$1****/gi; since regex starts at $1 instead of 0If perl works the same as Apache--the only context in which I have occasion to use $0--then strictly speaking $1 through $9 are the captures, while $0 is the entire matched pattern. For example, if the pattern is
$& returns the entire matched string. (At one point $0 did also, but now it returns the name of the program.)
If your code is to run on Perl 5.16 or earlier, beware that once Perl sees that you need one of $&, $` [the right angle apostrophe], or $' anywhere in the program, it has to provide them for every pattern match. This may substantially slow your program.