* Copyright (C) 2024 MDW * Copyright (C) 2024 Frédéric France * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /** * \file htdocs/core/class/rssparser.class.php * \ingroup core * \brief File of class to parse RSS feeds */ // @phan-file-suppress PhanPluginPHPDocInWrongComment /** * Class to parse RSS files */ class RssParser { /** * @var DoliDB Database handler. */ public $db; /** * @var string Error code (or message) */ public $error = ''; public $feed_version; private $_format = ''; private $_urlRSS; private $_language; private $_generator; private $_copyright; private $_lastbuilddate; private $_imageurl; private $_link; private $_title; private $_description; private $_lastfetchdate; // Last successful fetch private $_rssarray = array(); private $current_namespace; public $items = array(); public $current_item = array(); public $channel = array(); public $textinput = array(); public $image = array(); private $initem; private $intextinput; private $incontent; private $inimage; private $inchannel; // For parsing with xmlparser public $stack = array(); // parser stack private $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright'); /** * Constructor * * @param DoliDB $db Database handler */ public function __construct($db) { $this->db = $db; } /** * getFormat * * @return string */ public function getFormat() { return $this->_format; } /** * getUrlRss * * @return string */ public function getUrlRss() { return $this->_urlRSS; } /** * getLanguage * * @return string */ public function getLanguage() { return $this->_language; } /** * getGenerator * * @return string */ public function getGenerator() { return $this->_generator; } /** * getCopyright * * @return string */ public function getCopyright() { return $this->_copyright; } /** * getLastBuildDate * * @return string */ public function getLastBuildDate() { return $this->_lastbuilddate; } /** * getImageUrl * * @return string */ public function getImageUrl() { return $this->_imageurl; } /** * getLink * * @return string */ public function getLink() { return $this->_link; } /** * getTitle * * @return string */ public function getTitle() { return $this->_title; } /** * getDescription * * @return string */ public function getDescription() { return $this->_description; } /** * getLastFetchDate * * @return int */ public function getLastFetchDate() { return $this->_lastfetchdate; } /** * getItems * * @return array */ public function getItems() { return $this->_rssarray; } /** * Parse rss URL * * @param string $urlRSS Url to parse * @param int $maxNb Max nb of records to get (0 for no limit) * @param int $cachedelay 0=No cache, nb of seconds we accept cache files (cachedir must also be defined) * @param string $cachedir Directory where to save cache file (For example $conf->externalrss->dir_temp) * @return int Return integer <0 if KO, >0 if OK */ public function parser($urlRSS, $maxNb = 0, $cachedelay = 60, $cachedir = '') { include_once DOL_DOCUMENT_ROOT.'/core/lib/files.lib.php'; include_once DOL_DOCUMENT_ROOT.'/core/lib/geturl.lib.php'; $rss = ''; $str = ''; // This will contain content of feed // Check parameters if (!dol_is_url($urlRSS)) { $this->error = "ErrorBadUrl"; return -1; } $this->_urlRSS = $urlRSS; $newpathofdestfile = $cachedir.'/'.dol_hash($this->_urlRSS, '3'); // Force md5 hash (does not contain special chars) $newmask = '0644'; //dol_syslog("RssParser::parser parse url=".$urlRSS." => cache file=".$newpathofdestfile); $nowgmt = dol_now(); // Search into cache $foundintocache = 0; if ($cachedelay > 0 && $cachedir) { $filedate = dol_filemtime($newpathofdestfile); if ($filedate >= ($nowgmt - $cachedelay)) { //dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is not older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we use it."); $foundintocache = 1; $this->_lastfetchdate = $filedate; } else { dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is not found or older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we can't use it."); } } // Load file into $str if ($foundintocache) { // Cache file found and is not too old $str = file_get_contents($newpathofdestfile); } else { try { $result = getURLContent($this->_urlRSS, 'GET', '', 1, array(), array('http', 'https'), 0); if (!empty($result['content'])) { $str = $result['content']; } elseif (!empty($result['curl_error_msg'])) { $this->error = 'Error retrieving URL '.$this->_urlRSS.' - '.$result['curl_error_msg']; return -1; } } catch (Exception $e) { $this->error = 'Error retrieving URL '.$this->_urlRSS.' - '.$e->getMessage(); return -2; } } if ($str !== false) { // Convert $str into xml if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { //print 'xx'.LIBXML_NOCDATA; libxml_use_internal_errors(false); if (LIBXML_VERSION < 20900) { // Avoid load of external entities (security problem). // Required only if LIBXML_VERSION < 20900 // @phan-suppress-next-line PhanDeprecatedFunctionInternal libxml_disable_entity_loader(true); } $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA); } else { if (!function_exists('xml_parser_create')) { $this->error = 'Function xml_parser_create are not supported by your PHP'; return -1; } try { // @phan-suppress-next-line PhanTypeMismatchArgumentInternalProbablyReal $xmlparser = xml_parser_create(null); xml_parser_set_option($xmlparser, XML_OPTION_CASE_FOLDING, 0); xml_parser_set_option($xmlparser, XML_OPTION_SKIP_WHITE, 1); xml_parser_set_option($xmlparser, XML_OPTION_TARGET_ENCODING, "UTF-8"); //xml_set_external_entity_ref_handler($xmlparser, 'extEntHandler'); // Seems to have no effect even when function extEntHandler exists. if (!is_resource($xmlparser) && !is_object($xmlparser)) { $this->error = "ErrorFailedToCreateParser"; return -1; } xml_set_object($xmlparser, $this); // @phan-suppress-next-line PhanUndeclaredFunctionInCallable xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element'); // @phpstan-ignore-line // @phan-suppress-next-line PhanUndeclaredFunctionInCallable xml_set_character_data_handler($xmlparser, 'feed_cdata'); // @phpstan-ignore-line $status = xml_parse($xmlparser, $str, false); xml_parser_free($xmlparser); $rss = $this; //var_dump($status.' '.$rss->_format);exit; } catch (Exception $e) { $rss = null; } } } // If $rss loaded if ($rss) { // Save file into cache if (empty($foundintocache) && $cachedir) { dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is saved onto disk."); if (!dol_is_dir($cachedir)) { dol_mkdir($cachedir); } $fp = fopen($newpathofdestfile, 'w'); if ($fp) { fwrite($fp, $str); fclose($fp); dolChmod($newpathofdestfile); $this->_lastfetchdate = $nowgmt; } else { print 'Error, failed to open file '.$newpathofdestfile.' for write'; } } unset($str); // Free memory if (empty($rss->_format)) { // If format not detected automatically $rss->_format = 'rss'; if (empty($rss->channel)) { $rss->_format = 'atom'; } } $items = array(); // Save description entries if ($rss->_format == 'rss') { //var_dump($rss); if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { if (!empty($rss->channel->language)) { $this->_language = sanitizeVal((string) $rss->channel->language); } if (!empty($rss->channel->generator)) { $this->_generator = sanitizeVal((string) $rss->channel->generator); } if (!empty($rss->channel->copyright)) { $this->_copyright = sanitizeVal((string) $rss->channel->copyright); } if (!empty($rss->channel->lastbuilddate)) { $this->_lastbuilddate = sanitizeVal((string) $rss->channel->lastbuilddate); } if (!empty($rss->channel->image->url[0])) { $this->_imageurl = sanitizeVal((string) $rss->channel->image->url[0]); } if (!empty($rss->channel->link)) { $this->_link = sanitizeVal((string) $rss->channel->link); } if (!empty($rss->channel->title)) { $this->_title = sanitizeVal((string) $rss->channel->title); } if (!empty($rss->channel->description)) { $this->_description = sanitizeVal((string) $rss->channel->description); } } else { //var_dump($rss->channel); if (!empty($rss->channel['language'])) { $this->_language = sanitizeVal((string) $rss->channel['language']); } if (!empty($rss->channel['generator'])) { $this->_generator = sanitizeVal((string) $rss->channel['generator']); } if (!empty($rss->channel['copyright'])) { $this->_copyright = sanitizeVal((string) $rss->channel['copyright']); } if (!empty($rss->channel['lastbuilddate'])) { $this->_lastbuilddate = sanitizeVal((string) $rss->channel['lastbuilddate']); } if (!empty($rss->image['url'])) { $this->_imageurl = sanitizeVal((string) $rss->image['url']); } if (!empty($rss->channel['link'])) { $this->_link = sanitizeVal((string) $rss->channel['link']); } if (!empty($rss->channel['title'])) { $this->_title = sanitizeVal((string) $rss->channel['title']); } if (!empty($rss->channel['description'])) { $this->_description = sanitizeVal((string) $rss->channel['description']); } } if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { $items = $rss->channel->item; // With simplexml } else { $items = $rss->items; // With xmlparse } //var_dump($items);exit; } elseif ($rss->_format == 'atom') { //var_dump($rss); if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { if (!empty($rss->generator)) { $this->_generator = sanitizeVal((string) $rss->generator); } if (!empty($rss->lastbuilddate)) { $this->_lastbuilddate = sanitizeVal((string) $rss->modified); } if (!empty($rss->link->href)) { $this->_link = sanitizeVal((string) $rss->link->href); } if (!empty($rss->title)) { $this->_title = sanitizeVal((string) $rss->title); } if (!empty($rss->description)) { $this->_description = sanitizeVal((string) $rss->description); } } else { //if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language']; if (!empty($rss->channel['generator'])) { $this->_generator = sanitizeVal((string) $rss->channel['generator']); } //if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright']; if (!empty($rss->channel['modified'])) { $this->_lastbuilddate = sanitizeVal((string) $rss->channel['modified']); } //if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url']; if (!empty($rss->channel['link'])) { $this->_link = sanitizeVal((string) $rss->channel['link']); } if (!empty($rss->channel['title'])) { $this->_title = sanitizeVal((string) $rss->channel['title']); } //if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description']; if (!empty($rss->channel)) { $this->_imageurl = sanitizeVal($this->getAtomImageUrl($rss->channel)); } } if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { $tmprss = xml2php($rss); $items = $tmprss['entry']; } else { // With simplexml $items = $rss->items; // With xmlparse } //var_dump($items);exit; } $i = 0; // Loop on each record if (is_array($items)) { foreach ($items as $item) { //var_dump($item);exit; if ($rss->_format == 'rss') { if (getDolGlobalString('EXTERNALRSS_USE_SIMPLEXML')) { $itemLink = sanitizeVal((string) $item->link); $itemTitle = sanitizeVal((string) $item->title); $itemDescription = sanitizeVal((string) $item->description); $itemPubDate = sanitizeVal((string) $item->pubDate); $itemId = ''; $itemAuthor = ''; } else { $itemLink = sanitizeVal((string) $item['link']); $itemTitle = sanitizeVal((string) $item['title']); $itemDescription = sanitizeVal((string) $item['description']); $itemPubDate = sanitizeVal((string) $item['pubdate']); $itemId = sanitizeVal((string) $item['guid']); $itemAuthor = sanitizeVal((string) ($item['author'] ?? '')); } // Loop on each category $itemCategory = array(); if (!empty($item->category) && is_array($item->category)) { foreach ($item->category as $cat) { $itemCategory[] = (string) $cat; } } } elseif ($rss->_format == 'atom') { $itemLink = (isset($item['link']) ? sanitizeVal((string) $item['link']) : ''); $itemTitle = sanitizeVal((string) $item['title']); $itemDescription = sanitizeVal($this->getAtomItemDescription($item)); $itemPubDate = sanitizeVal((string) $item['created']); $itemId = sanitizeVal((string) $item['id']); $itemAuthor = sanitizeVal((string) ($item['author'] ? $item['author'] : $item['author_name'])); $itemCategory = array(); } else { $itemLink = ''; $itemTitle = ''; $itemDescription = ''; $itemPubDate = ''; $itemId = ''; $itemAuthor = ''; $itemCategory = array(); print 'ErrorBadFeedFormat'; } // Add record to result array $this->_rssarray[$i] = array( 'link' => $itemLink, 'title' => $itemTitle, 'description' => $itemDescription, 'pubDate' => $itemPubDate, 'category' => $itemCategory, 'id' => $itemId, 'author' => $itemAuthor ); //var_dump($this->_rssarray); $i++; if ($i > $maxNb) { break; // We get all records we want } } } return 1; } else { $this->error = 'ErrorFailedToLoadRSSFile'; return -1; } } // phpcs:disable PEAR.NamingConventions.ValidFunctionName.ScopeNotCamelCaps /** * Triggered when opened tag is found * * @param string $p Start * @param string $element Tag * @param array $attrs Attributes of tags * @return void */ public function feed_start_element($p, $element, $attrs) { // phpcs:enable $el = $element = strtolower($element); $attrs = array_change_key_case($attrs, CASE_LOWER); // check for a namespace, and split if found $ns = false; if (strpos($element, ':')) { list($ns, $el) = explode(':', $element, 2); } if ($ns and $ns != 'rdf') { $this->current_namespace = $ns; } // if feed type isn't set, then this is first element of feed identify feed from root element if (empty($this->_format)) { if ($el == 'rdf') { $this->_format = 'rss'; $this->feed_version = '1.0'; } elseif ($el == 'rss') { $this->_format = 'rss'; $this->feed_version = $attrs['version']; } elseif ($el == 'feed') { $this->_format = 'atom'; $this->feed_version = $attrs['version']; $this->inchannel = true; } return; } if ($el == 'channel') { $this->inchannel = true; } elseif ($el == 'item' || $el == 'entry') { $this->initem = true; if (isset($attrs['rdf:about'])) { $this->current_item['about'] = $attrs['rdf:about']; } } elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'textinput') { // if we're in the default namespace of an RSS feed, // record textinput or image fields $this->intextinput = true; } elseif ($this->_format == 'rss' && $this->current_namespace == '' && $el == 'image') { $this->inimage = true; } elseif ($this->_format == 'atom' && in_array($el, $this->_CONTENT_CONSTRUCTS)) { // handle atom content constructs // avoid clashing w/ RSS mod_content if ($el == 'content') { $el = 'atom_content'; } $this->incontent = $el; } elseif ($this->_format == 'atom' && $this->incontent) { // if inside an Atom content construct (e.g. content or summary) field treat tags as text // if tags are inlined, then flatten $attrs_str = implode(' ', array_map('rss_map_attrs', array_keys($attrs), array_values($attrs))); $this->append_content("<$element $attrs_str>"); array_unshift($this->stack, $el); } elseif ($this->_format == 'atom' && $el == 'link') { // Atom support many links per containing element. // Magpie treats link elements of type rel='alternate' // as being equivalent to RSS's simple link element. if (isset($attrs['rel']) && $attrs['rel'] == 'alternate') { $link_el = 'link'; } elseif (!isset($attrs['rel'])) { $link_el = 'link'; } else { $link_el = 'link_'.$attrs['rel']; } $this->append($link_el, $attrs['href']); } else { // set stack[0] to current element array_unshift($this->stack, $el); } } // phpcs:disable PEAR.NamingConventions.ValidFunctionName.ScopeNotCamelCaps /** * Triggered when CDATA is found * * @param string $p P * @param string $text Tag * @return void */ public function feed_cdata($p, $text) { // phpcs:enable if ($this->_format == 'atom' and $this->incontent) { $this->append_content($text); } else { $current_el = implode('_', array_reverse($this->stack)); $this->append($current_el, $text); } } // phpcs:disable PEAR.NamingConventions.ValidFunctionName.ScopeNotCamelCaps /** * Triggered when closed tag is found * * @param string $p P * @param string $el Tag * @return void */ public function feed_end_element($p, $el) { // phpcs:enable $el = strtolower($el); if ($el == 'item' or $el == 'entry') { $this->items[] = $this->current_item; $this->current_item = array(); $this->initem = false; } elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput') { $this->intextinput = false; } elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image') { $this->inimage = false; } elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS)) { $this->incontent = false; } elseif ($el == 'channel' or $el == 'feed') { $this->inchannel = false; } elseif ($this->_format == 'atom' and $this->incontent) { // balance tags properly // note: i don't think this is actually necessary if ($this->stack[0] == $el) { $this->append_content(""); } else { $this->append_content("<$el />"); } array_shift($this->stack); } else { array_shift($this->stack); } $this->current_namespace = false; } /** * To concat 2 strings with no warning if an operand is not defined * * @param string $str1 Str1 * @param string $str2 Str2 * @return string String cancatenated */ public function concat(&$str1, $str2 = "") { if (!isset($str1)) { $str1 = ""; } $str1 .= $str2; return $str1; } // phpcs:disable PEAR.NamingConventions.ValidFunctionName.ScopeNotCamelCaps /** * Enter description here ... * * @param string $text Text * @return void */ public function append_content($text) { // phpcs:enable if (!empty($this->initem)) { $this->concat($this->current_item[$this->incontent], $text); } elseif (!empty($this->inchannel)) { $this->concat($this->channel[$this->incontent], $text); } } /** * smart append - field and namespace aware * * @param string $el El * @param string $text Text * @return void */ public function append($el, $text) { if (!$el) { return; } if (!empty($this->current_namespace)) { if (!empty($this->initem)) { $this->concat($this->current_item[$this->current_namespace][$el], $text); } elseif (!empty($this->inchannel)) { $this->concat($this->channel[$this->current_namespace][$el], $text); } elseif (!empty($this->intextinput)) { $this->concat($this->textinput[$this->current_namespace][$el], $text); } elseif (!empty($this->inimage)) { $this->concat($this->image[$this->current_namespace][$el], $text); } } else { if (!empty($this->initem)) { $this->concat($this->current_item[$el], $text); } elseif (!empty($this->intextinput)) { $this->concat($this->textinput[$el], $text); } elseif (!empty($this->inimage)) { $this->concat($this->image[$el], $text); } elseif (!empty($this->inchannel)) { $this->concat($this->channel[$el], $text); } } } /** * Return a description/summary for one item from a ATOM feed * * @param array $item A parsed item of a ATOM feed * @param int $maxlength (optional) The maximum length for the description * @return string A summary description */ private function getAtomItemDescription(array $item, $maxlength = 500) { $result = ""; if (isset($item['summary'])) { $result = $item['summary']; } elseif (isset($item['atom_content'])) { $result = $item['atom_content']; } // remove all HTML elements that can possible break the maximum size of a tooltip, // like headings, image, video etc. and allow only simple style elements $result = strip_tags($result, "

    1. "); $result = str_replace("\n", "", $result); if (strlen($result) > $maxlength) { $result = substr($result, 0, $maxlength); $result .= "..."; } return $result; } /** * Return a URL to a image of the given ATOM feed * * @param array $feed The ATOM feed that possible contain a link to a logo or icon * @return string A URL to a image from a ATOM feed when found, otherwise a empty string */ private function getAtomImageUrl(array $feed) { if (isset($feed['icon'])) { return $feed['logo']; } if (isset($feed['icon'])) { return $feed['logo']; } if (isset($feed['webfeeds:logo'])) { return $feed['webfeeds:logo']; } if (isset($feed['webfeeds:icon'])) { return $feed['webfeeds:icon']; } if (isset($feed['webfeeds:wordmark'])) { return $feed['webfeeds:wordmark']; } return ""; } } /* * A method for the xml_set_external_entity_ref_handler() * * @param XMLParser $parser * @param string $ent * @param string|false $base * @param string $sysID * @param string|false $pubID * @return bool function extEntHandler($parser, $ent, $base, $sysID, $pubID) { print 'extEntHandler ran'; return true; } */ /** * Function to convert an XML object into an array * * @param string $k Key * @param string $v Value * @return string */ function rss_map_attrs($k, $v) { return "$k=\"$v\""; } /** * Function to convert an XML object into an array * * @param SimpleXMLElement $xml Xml * @return array|string */ function xml2php($xml) { $threads = 0; $tab = false; $array = array(); foreach ($xml->children() as $key => $value) { $child = xml2php($value); //To deal with the attributes foreach ($value->attributes() as $ak => $av) { $child[$ak] = (string) $av; } //Let see if the new child is not in the array if ($tab === false && in_array($key, array_keys($array))) { //If this element is already in the array we will create an indexed array $tmp = $array[$key]; $array[$key] = null; $array[$key][] = $tmp; $array[$key][] = $child; $tab = true; } elseif ($tab === true) { //Add an element in an existing array $array[$key][] = $child; } else { //Add a simple element $array[$key] = $child; } $threads++; } if ($threads == 0) { return (string) $xml; } return $array; }