Forum Moderators: phranque
$string = <<<EOF
Testing, testing, 1, 2, 3<br>
<img src="http://example.com/foo_a.jpg" width="130" height="86" alt="foo_a" align="left" title="fooA" border="0">
<img width="180" src="http://example.com/foo_b.jpg" height="120" alt="foo_b" align="left" title="fooB" border="0" style="margin: 10px">
<img src="http://example.com/foo_c.jpg" style="width: 210px; height: 175px" alt="foo_c" align="left" title="fooC" border="0">
<img src="http://example.com/foo_d.jpg">
<img src="http://example.com/foo_e.jpg" width="1" height="1" border="0"> preg_match_all('#
<img
.*?
(?:
width\s?=\s?
("|\')
([0-9]+)
\1
)
.*?
src\s?=\s?
("|\')
(
http[^\3]+?
(?!feeds.feedburner.com) // I don't want to save images from feedburner.com
[^\3]*?
)
\3
.*?
(?:
width\s?=\s?
("|\')
([0-9]+)
\5
)
.*?
(?:
style\s?=\s?
("|\')
[^\7]*
width\s?:\s?
([0-9]+)
px
[^\7]*
\7
)
.*?
>
#msix',
$string, $matches);
print_r($matches) <img +
(?:(?!src|width)\w+ *= *"[^"]*" *)*
(src|width) *= *"([^"]*)" *
(?:(?!src|width)\w+ *= *"[^"]*" *+)*
(?:(src|width) *= *"([^"]*)")? *
(?:(?!src|width)\w+ *= *"[^"]*" *)*
>
i.e. <img +(?:(?!src|width)\w+ *= *"[^"]*" *)*(src|width) *= *"([^"]*)" *(?:(?!src|width)\w+ *= *"[^"]*" *+)*(?:(src|width) *= *"([^"]*)")? * *(?:(?!src|width)\w+ *= *"[^"]*" *)*>
I'll let that marinate for a while so the errors and oversights can jump up and hit me in the face. It gives you four captures, which can be src or width in either order. Is this arising from one of those UGC sites where it is genuinely out of your control to make the <img> tags have all their attributes in the same order all the time, every time? Feh.
First experiment, broken into separate lines for conceptual purposes:
Seems like a simple HTML parser would be the way to go with this.
I took your code and just pasted it in
Oops, cut-and-paste booboo on my part. Should be:
<img +(?:(?!src|width)\w+ *= *"[^"]*" *)*(src|width) *= *"([^"]*)" *(?:(?!src|width)\w+ *= *"[^"]*" *+)*(?:(src|width) *= *"([^"]*)")? *(?:(?!src|width)\w+ *= *"[^"]*" *)*>
though that shouldn't make any difference.
// I tried to create every potential variation I could think of
$string = <<<EOF
<img src="http://example.com/aaa.jpg" width="230" height="90" border="0">
<img width="140" height="100" src="http://example.com/bbb.jpg" border="0">
<img src="http://example.com/ccc.jpg" border="0" style="width: 150px">
<img src="http://example.com/ddd.jpg" width="160" height="120" border="0">
<img src="http://example.com/eee.jpg" data-foo data-bar="blah" width="170" border="0">
<img border="0" src="http://example.com/fff.jpg" width="180" height="140">
<img border="0" src="http://example.com/ggg.jpg" style="width: 190px; height: 150px" width="200" height="160">
<img style="width: 210px; height: 170px" width="220" height="180" border="0" src="http://example.com/hhh.jpg">
EOF;

// Notes on the pattern below. They are kept out of the pattern on purpose:
// with the "x" modifier only "#" starts an in-pattern comment, and "#" is the
// delimiter here, so "//" text inside the pattern would be matched literally.
//
//   - "<img" followed by "anything that's not a >", which also allows for
//     data-attributes without a value.
//   - The first src/width/style attribute: group 1 is the name, group 2 the
//     quote, group 3 a leading "width: NNNpx" (only present for style) and
//     group 4 the rest of the value.
//   - The second src/width/style attribute: groups 5-8, same layout.
//   - Group 9 is everything left before ">". Making a third attribute group
//     optional broke the match when it didn't exist, so the remainder is
//     captured as-is and inspected in PHP when needed.
//
// Values are matched with (?:(?!\N).)*? rather than [^\N]*?: a back-reference
// is not available inside a character class, where \2 is an octal escape.
preg_match_all('#
<img\s+
[^>]*?
(?:
(src|width|style)\s*=\s*
("|\')
(?:
width\s*:\s*
([0-9]+)
px
)?
((?:(?!\2).)*?)
\2
)
[^>]*?
(?:
(src|width|style)\s*=\s*
("|\')
(?:
width\s*:\s*
([0-9]+)
px
)?
((?:(?!\6).)*?)
\6
)
([^>]*?)
>
#msix',
$string, $matches);

// you can print $matches here to see the indexes that are used below
// print_r($matches);

// src => width for every image that was accepted
$fixed = array();
// the maximum width I'm willing to accept
$finalWidth = 500;
// the minimum width I'm willing to accept
$minimum = 80;
// these track the smallest picture in the list that's larger than $minimum
$finalSrc = false;
$thisSrc = false;
$thisWidth = false;

$counter = count($matches[1]);
for ($i = 0; $i < $counter; $i++) {
    // start clean for every tag so a tag that matches no branch cannot
    // reuse the values of the previous one
    $thisSrc = false;
    $thisWidth = false;

    $firstName = $matches[1][$i];
    $secondName = $matches[5][$i];
    $leftover = $matches[9][$i];

    // src comes first and width parameter is set
    if ($firstName === 'src' &&
        $secondName === 'width' &&
        $matches[8][$i] &&
        $matches[8][$i] > 0) {
        $thisSrc = $matches[4][$i];
        $thisWidth = $matches[8][$i];
    }
    // width parameter comes first
    else if (
        $firstName === 'width' &&
        $secondName === 'src' &&
        $matches[4][$i] &&
        $matches[4][$i] > $minimum) {
        $thisSrc = $matches[8][$i];
        $thisWidth = $matches[4][$i];
    }
    // no width parameter, but do have style width
    else if (
        $firstName === 'src' &&
        $secondName === 'style' &&
        $matches[7][$i] &&
        $matches[7][$i] > $minimum) {
        $thisSrc = $matches[4][$i];
        $thisWidth = $matches[7][$i];
    }
    // the width (attribute or style) is somewhere in the leftover text
    else if (strpos($leftover, 'width') !== false) {
        // the src is whichever of the two captured attributes is named "src";
        // the leftover text itself is never a usable src
        $pairSrc = '';
        if ($firstName === 'src') {
            $pairSrc = $matches[4][$i];
        } else if ($secondName === 'src') {
            $pairSrc = $matches[8][$i];
        }
        // (px|\1) accepts either a CSS "px" suffix or the closing quote
        $found = preg_match(
            '#width\s*[:=]\s*("|\')?([0-9]+)(px|\1)#si',
            $leftover, $widthMatch);
        if ($found && $pairSrc !== '' && $widthMatch[2] > $minimum) {
            $thisSrc = $pairSrc;
            $thisWidth = $widthMatch[2];
        }
    }
    // style AND width parameters before src
    else if (strpos($leftover, 'src') !== false) {
        // stop at the first closing quote instead of the last one
        $found = preg_match(
            '#src\s*=\s*("|\')((?:(?!\1).)+)\1#si',
            $leftover, $srcMatch);
        if ($found &&
            $secondName === 'width' &&
            $matches[8][$i] &&
            $matches[8][$i] > 0) {
            $thisSrc = $srcMatch[2];
            $thisWidth = $matches[8][$i];
        }
        else if (
            $found &&
            $firstName === 'width' &&
            $matches[4][$i] &&
            $matches[4][$i] > $minimum) {
            $thisSrc = $srcMatch[2];
            $thisWidth = $matches[4][$i];
        }
    }

    if ($thisWidth) {
        $fixed[$thisSrc] = $thisWidth;
    }

    // keep the narrowest image that is still wider than $minimum
    if ($thisWidth &&
        $thisWidth < $finalWidth &&
        $thisWidth > $minimum) {
        $finalSrc = $thisSrc;
        $finalWidth = $thisWidth;
    }
}
// let's see all of the results
print_r($fixed);
echo <<<EOF
Smallest image:
$finalSrc, $finalWidth
EOF;
// Results:
Array
(
[http://example.com/aaa.jpg] => 230
[http://example.com/bbb.jpg] => 140
[http://example.com/ccc.jpg] => 150
[http://example.com/ddd.jpg] => 160
[http://example.com/eee.jpg] => 170
[http://example.com/fff.jpg] => 180
[http://example.com/ggg.jpg] => 190
[http://example.com/hhh.jpg] => 220
)
Smallest image:
http://example.com/bbb.jpg, 140
asort($fixed, 1);
foreach ($fixed as $finalSrc => $finalWidth) break;
Seems like a simple HTML parser would be the way to go with this.
@phranque, can you suggest such a monster? I remember testing out a parser several years back and it was a nightmare, but I'm sure things have improved since then.
// I set the variables to be fixed at the beginning of the benchmark,
// so setting them wasn't part of the speed test
$string = <<<EOF
<img src="http://example.com/aaa.jpg" width="230" height="90" border="0">
<img width="140" height="100" src="http://example.com/bbb.jpg" border="0">
<img src="http://example.com/ccc.jpg" border="0" style="width: 150px">
<img src="http://example.com/ddd.jpg" width="160" height="120" border="0">
<img src="http://example.com/eee.jpg" data-foo data-bar="blah" width="170" border="0">
<img border="0" src="http://example.com/fff.jpg" width="180" height="140">
<img border="0" src="http://example.com/ggg.jpg" style="width: 190px; height: 150px" width="200" height="160">
<img style="width: 210px; height: 170px" width="220" height="180" border="0" src="http://example.com/hhh.jpg">
<img src="http://example.com/iii.jpg" width="60" height="20" border="0">
EOF;
// src => width (int) for every accepted image
$fixed = array();
// $finalSrc / $finalWidth are only initial values here: they are overwritten
// by the narrowest accepted image at the bottom, if there is one
$finalSrc = false;
$finalWidth = 500;
// an image must be wider than this to be accepted
$minWidth = 80;
// this is where the speed test began
//
// get all of the <img ...> tags
preg_match_all('/<img\s[^>]+>/i',
    $string, $matches);
foreach ($matches[0] as $imgParams) {
    // get the src, width, and style parameters of this tag
    //
    // - each quote style has its own alternative (inside a branch-reset
    //   group, so the value is always group 2); a back-reference cannot be
    //   used inside a character class to mean "anything but the quote"
    // - the look-behind keeps data-src= / data-width= from being read as
    //   src= / width=
    // - the value is captured whole; no "px" stripping is applied here, so a
    //   src that happens to contain "px" is left intact
    preg_match_all(
        '#(?<![\w-])(src|width|style)\s*=\s*(?|"([^"]*)"|\'([^\']*)\')#i',
        $imgParams, $img, PREG_SET_ORDER);

    // not knowing the order of parameters, convert the matches to an
    // associative array keyed by the lower-cased attribute name
    $imgTag = array();
    foreach ($img as $attr) {
        $imgTag[strtolower($attr[1])] = $attr[2];
    }

    $thisWidth = false;
    // in case both style="width" and "width=" are listed, the width
    // parameter has priority over style
    if (isset($imgTag['width']) &&
        ctype_digit($imgTag['width']) &&
        (int) $imgTag['width'] > $minWidth) {
        $thisWidth = (int) $imgTag['width'];
    }
    // otherwise look for a pixel width anywhere in the style; the look-behind
    // skips max-width / min-width / border-width
    else if (
        isset($imgTag['style']) &&
        preg_match('#(?<![\w-])width\s*:\s*([0-9]+)px#i',
            $imgTag['style'], $styleArr) &&
        (int) $styleArr[1] > $minWidth) {
        $thisWidth = (int) $styleArr[1];
    }

    // a width without a usable src is of no use
    if ($thisWidth !== false &&
        isset($imgTag['src']) &&
        $imgTag['src'] !== '') {
        $fixed[$imgTag['src']] = $thisWidth;
    }
}
// narrowest image first; the first entry becomes the final result
asort($fixed, SORT_NUMERIC);
foreach ($fixed as $finalSrc => $finalWidth) {
    break;
}