Linkfield
Like images, this is another set of data that was embedded in the content but would be far more interesting if it could be extracted and added into its CCK field. As CCK fields go, it's pretty simple; as extraction goes, well, this isn't going to be perfect, but it was good enough:
/**
 * Extracts the links embedded in a piece of text.
 *
 * Three kinds of link are collected, in this order:
 *  - BBCode tags of the form [url=ADDRESS]TITLE[/url] (a ":suffix" on the
 *    closing tag is tolerated). An empty TITLE falls back to ADDRESS as it was
 *    written, and ADDRESS gets "http://" prepended when it carries no scheme.
 *  - Absolute URLs that start with one of a fixed list of schemes (http://,
 *    https://, ftp://, mailto:, ...).
 *  - Bare "www." addresses, which get "http://" prepended.
 *
 * Free-text URLs are only recognised when delimited by whitespace, round
 * brackets or a few HTML tags (<p>, <li>, <br>). The text is padded with one
 * space on each side before those two scans so that a URL sitting at the very
 * start or very end of the text is delimited as well.
 *
 * Side effect: when at least one link was found, the list of URLs is printed
 * as an HTML fragment.
 *
 * @param $text
 *   The text to scan; it is cast to a string before use.
 *
 * @return
 *   A list of links. Each link is an array with the keys 'url', 'title' and
 *   'attributes'; 'attributes' is always the string 'N;' (PHP's serialized
 *   NULL, presumably what the target link field expects - confirm against the
 *   field schema).
 */
function _x_node_ext_links($text) {
  $text = (string) $text;
  $links = array();

  // BBCode: [url=ADDRESS]TITLE[/url].
  if (preg_match_all('#\[url=([\w:;&,%+~!=@\/\.\-\#\?]+?)\](.*?)\[/url(?::\w+)?\]#si', $text, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      // NOTE: a title-stripping preg_replace() used to sit here. Its pattern
      // did not compile (unmatched closing parenthesis) and its result was
      // thrown away, so it only ever produced a warning; it has been dropped.

      // An empty title falls back to the address exactly as written, i.e.
      // before any scheme is prepended below.
      if ($url[2] === '') {
        $url[2] = $url[1];
      }
      // Prefix http:// if the address has no scheme.
      if (!preg_match('#^\w*://#', $url[1])) {
        $url[1] = 'http://'. $url[1];
      }
      $links[] = array('url' => $url[1], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Both patterns below demand a delimiter before and after the URL. Padding
  // guarantees one at the text boundaries too.
  $padded = ' '. $text .' ';

  // Match absolute URLs.
  if (preg_match_all("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $padded, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      // Group 2 is the URL without the leading delimiter and without any
      // trailing sentence punctuation.
      $links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Match www domains/addresses.
  if (preg_match_all("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $padded, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      // Prefix http:// as these addresses never carry a scheme.
      $url[2] = 'http://'. $url[2];
      $links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Progress output listing every URL that was retrieved.
  if (count($links)) {
    print ' - <strong>retrieved links:</strong> ';
    foreach ($links as $link) {
      print $link['url'] .', ';
    }
  }

  return $links;
}
// NOTE(review): this fragment has no function header in view. It uses $text
// without defining it and ends with a return statement and a closing brace
// that have no visible opening counterpart - it looks like a stray second copy
// of a link-extraction function body. Confirm whether it should be removed.
// Accumulates one array('url', 'title', 'attributes') entry per link found.
$links = array();
// Disabled debugging aid: the block comment below hides a looser BBCode match
// that would dump its captures with var_export().
/**
* Permissive test
*
if (preg_match_all('#\[url(.*?)\](.*?)\[/url#si', $text, $matches)) {
var_export($matches);
}
//*/
// BBCode
// Captures the address (group 1) and the title (group 2) of every
// [url=ADDRESS]TITLE[/url] tag; a ":suffix" on the closing tag is accepted.
if (preg_match_all('#\[url=([\w:;&,%+~!=@\/\.\-\#\?]+?)\](.*?)\[/url(?::\w+)?\]#si', $text, $matches, PREG_SET_ORDER)) {
foreach ($matches as $url) {
// avoid displaying false www. addresses
// NOTE(review): the return value of this call is discarded, so $url[2] is
// not modified here. The pattern also looks malformed (the leading "\["
// escapes the bracket, leaving the final ")" unmatched) - verify intent.
preg_replace('#\[\w:;&,%+~!=@\/\.\-\#\?]+?)#si', '', $url[2]);
// An empty title falls back to the address as written.
if ($url[2] == '') { $url[2] = $url[1]; }
// prefix http:// if no scheme
if (! preg_match('#^\w*://#', $url[1]) ) { $url[1] = 'http://'. $url[1]; }
$links[] = array('url' => $url[1], 'title' => $url[2], 'attributes' => 'N;',);
}
}
// Match absolute URLs.
// A URL only matches when preceded by <p>, <li>, <br>, whitespace or "(" and
// followed by </p>, </li>, <br>, whitespace or ")"; trailing . , ? ! are
// captured separately so they stay out of group 2.
if (preg_match_all("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $text, $matches, PREG_SET_ORDER)) {
foreach ($matches as $url) {
// Group 2 is the full URL including its scheme; it doubles as the title.
$links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
}
}
// Match www domains/addresses.
// Same delimiter rules as above, except that <br> is not accepted as a
// leading delimiter here.
if (preg_match_all("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $text, $matches, PREG_SET_ORDER)) {
foreach ($matches as $url) {
// prefix http:// as no scheme
$url[2] = 'http://'. $url[2];
$links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
}
}
// Print the retrieved URLs as an HTML fragment when any were found.
if (count($links)) {
print ' - <strong>retrieved links:</strong> ';
foreach ($links as $link) {
print $link['url'] .', ';
}
}
return $links;
}
