Linkfield

Like images, links are another set of data that was embedded in the content but is far more useful once extracted and added to its own CCK field. As CCK fields go it's pretty simple; as extraction goes, well, this isn't going to be perfect, but it was good enough:

/**
 * Extracts links embedded in a piece of text.
 *
 * Three passes are made over $text, and their results are appended in this
 * order:
 *  1. BBCode tags of the form [url=ADDRESS]TITLE[/url] (an optional ":suffix"
 *     after "/url" is tolerated).
 *  2. Bare absolute URLs with one of the listed schemes (http://, https://,
 *     ftp://, mailto:, ...).
 *  3. Bare "www." addresses, which get "http://" prefixed.
 *
 * Bare URLs are only recognised when preceded by whitespace, "(" or one of a
 * few opening tags, and followed by whitespace, ")" or a matching closing
 * tag. A URL at the very start or very end of $text is therefore not picked
 * up. No de-duplication is performed.
 *
 * Side effect: when at least one link is found, an HTML progress line listing
 * the retrieved URLs is printed.
 *
 * @param $text
 *   The text to scan.
 *
 * @return
 *   A list of arrays, each with the keys:
 *   - url: the link address, always carrying a scheme.
 *   - title: the BBCode link text, or the address when there is none.
 *   - attributes: the string 'N;', which is PHP's serialized NULL
 *     (presumably the form the link field stores -- confirm against the
 *     field schema).
 */
function _x_node_ext_links($text) {
  $links = array();

  // BBCode
  if (preg_match_all('#\[url=([\w:;&,%+~!=@\/\.\-\#\?]+?)\](.*?)\[/url(?::\w+)?\]#si', $text, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      // A preg_replace() call used to sit here ("avoid displaying false www.
      // addresses"). Its pattern did not compile (unmatched ")") and its
      // result was discarded, so it only raised a warning per match. It has
      // been removed; titles are passed through exactly as before.

      // An empty link text falls back to the address as written, i.e. before
      // any scheme is prefixed below.
      if ($url[2] === '') {
        $url[2] = $url[1];
      }
      // prefix http:// if no scheme
      if (!preg_match('#^\w*://#', $url[1])) {
        $url[1] = 'http://'. $url[1];
      }
      $links[] = array('url' => $url[1], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Match absolute URLs. Group 2 is the full address; trailing punctuation
  // (.,?!) is kept out of it by the final character class.
  if (preg_match_all("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $text, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      $links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Match www domains/addresses.
  if (preg_match_all("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", $text, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $url) {
      // prefix http:// as no scheme
      $url[2] = 'http://'. $url[2];
      $links[] = array('url' => $url[2], 'title' => $url[2], 'attributes' => 'N;',);
    }
  }

  // Progress output listing what was found.
  if (count($links)) {
    print ' - <strong>retrieved links:</strong> ';
    foreach ($links as $link) {
      print $link['url'] .', ';
    }
  }

  return $links;
}