From d6620bfbdbc2e0fca69ad46c703606b75ca31602 Mon Sep 17 00:00:00 2001 From: Eugene Yurkevich Date: Fri, 17 Mar 2017 20:51:58 +0300 Subject: [PATCH] Refactoring RobotsTxtParser to significantly improve performance (100-1000x) --- RobotsTxtParser.php | 430 ++++++++++--------------------------- tests/NoIndexTest.php | 10 +- tests/RelativePathTest.php | 4 +- 3 files changed, 122 insertions(+), 322 deletions(-) diff --git a/RobotsTxtParser.php b/RobotsTxtParser.php index f006d82..34724f3 100644 --- a/RobotsTxtParser.php +++ b/RobotsTxtParser.php @@ -5,8 +5,7 @@ * * @author Eugene Yurkevich (bopodaa@gmail.com) * - * - * Some useful links and materials: + * Useful links and materials about robots.txt crawling * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt * @link https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml */ @@ -15,15 +14,8 @@ class RobotsTxtParser // default encoding const DEFAULT_ENCODING = 'UTF-8'; - // states - const STATE_ZERO_POINT = 'zero-point'; - const STATE_READ_DIRECTIVE = 'read-directive'; - const STATE_SKIP_SPACE = 'skip-space'; - const STATE_SKIP_LINE = 'skip-line'; - const STATE_READ_VALUE = 'read-value'; - // directives - const DIRECTIVE_NOINDEX = 'noindex'; + const DIRECTIVE_NOINDEX = 'noindex'; const DIRECTIVE_ALLOW = 'allow'; const DIRECTIVE_DISALLOW = 'disallow'; const DIRECTIVE_HOST = 'host'; @@ -31,49 +23,45 @@ class RobotsTxtParser const DIRECTIVE_USERAGENT = 'user-agent'; const DIRECTIVE_CRAWL_DELAY = 'crawl-delay'; const DIRECTIVE_CLEAN_PARAM = 'clean-param'; - - /** - * Default user-agent - * First off, links should be checked by specific user-agent rules. If specific user-agent isn't specified than default user-agent used. - */ - const USER_AGENT_ALL = '*'; - // current state - private $state = ''; + //default user-agent + const USER_AGENT_ALL = '*'; - // robots.txt file content + /** + * @var string $content Original robots.txt content + */ private $content = ''; - // rules set - private $rules = array(); + /** + * @var array $rules Rules with all parsed directives by all user-agents + */ + private $rules = []; + + /** + * @var string $currentDirective Current directive + */ + private $currentDirective; - // internally used variables - private $current_word = ''; - private $current_char = ''; - private $char_index = 0; - private $current_directive = ''; - private $userAgent = self::USER_AGENT_ALL; + /** + * @var string $userAgent Current user-agent + */ + private $userAgent; /** - * @param string $content - file content - * @param string $encoding - encoding + * @param string $content Robots.txt content + * @param string $encoding Encoding * @return RobotsTxtParser */ public function __construct($content, $encoding = self::DEFAULT_ENCODING) { // convert encoding $encoding = !empty($encoding) ? $encoding : mb_detect_encoding($content, mb_detect_order(), false); - if ($encoding == "UTF-8") { - $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8'); - } + if ($encoding == "UTF-8") { + $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8'); + } // set content - $this->content = iconv(mb_detect_encoding($content, mb_detect_order(), false), "UTF-8//IGNORE", $content); - $this->content .= "\n"; - - // set default state - $this->state = self::STATE_ZERO_POINT; + $this->content = iconv(mb_detect_encoding($content, mb_detect_order(), false), "UTF-8//IGNORE", $content); - // parse rules - default state $this->prepareRules(); } @@ -98,7 +86,7 @@ public function getRules($userAgent = NULL) return $this->rules[$userAgent]; } else { - return array(); + return []; } } } @@ -107,7 +95,7 @@ public function getRules($userAgent = NULL) * Get sitemaps links. * Sitemap always relates to all user-agents and return in rules with user-agent "*" * - * @return array all sitemap urls + * @return array */ public function getSitemaps() { @@ -116,329 +104,141 @@ public function getSitemaps() return $rules[self::DIRECTIVE_SITEMAP]; } - return array(); - } - - public function getContent() - { - return $this->content; - } - - /** - * NoIndex directive signal - */ - private function directiveNoIndex() - { - return ($this->current_directive == self::DIRECTIVE_NOINDEX); - } - - /** - * Comment signal (#) - */ - private function sharp() - { - return ($this->current_char == '#'); - } - - /** - * Allow directive signal - */ - private function directiveAllow() - { - return ($this->current_directive == self::DIRECTIVE_ALLOW); - } - - /** - * Disallow directive signal - */ - private function directiveDisallow() - { - return ($this->current_directive == self::DIRECTIVE_DISALLOW); - } - - /** - * Host directive signal - */ - private function directiveHost() - { - return ($this->current_directive == self::DIRECTIVE_HOST); - } - - /** - * Sitemap directive signal - */ - private function directiveSitemap() - { - return ($this->current_directive == self::DIRECTIVE_SITEMAP); - } - - /** - * Clean-param directive signal - */ - private function directiveCleanParam() - { - return ($this->current_directive == self::DIRECTIVE_CLEAN_PARAM); - } - - /** - * User-agent directive signal - */ - private function directiveUserAgent() - { - return ($this->current_directive == self::DIRECTIVE_USERAGENT); - } - - /** - * Crawl-Delay directive signal - */ - private function directiveCrawlDelay() - { - return ($this->current_directive == self::DIRECTIVE_CRAWL_DELAY); - } - - /** - * Key : value pair separator signal - */ - private function lineSeparator() - { - return ($this->current_char == ':'); - } - - /** - * Move to new line signal - */ - private function newLine() - { - $asciiCode = ord($this->current_char); - - return ($this->current_char == "\n" - || $asciiCode == 13 - || $asciiCode == 10 - || $this->current_word == "\r\n" - || $this->current_word == "\n\r" - ); - } - - /** - * "Space" signal - */ - private function space() - { - return ($this->current_char == "\s"); + return []; } /** - * Change state + * Return original robots.txt content * - * @param string $stateTo - state that should be set - * @return void + * @return string */ - private function switchState($stateTo = self::STATE_SKIP_LINE) + public function getContent() { - $this->state = $stateTo; + return $this->content; } /** - * Parse rules + * Return array of supported directives * - * @return void - */ - public function prepareRules() - { - $contentLength = mb_strlen($this->content); - while ($this->char_index <= $contentLength) { - $this->step(); - } - - foreach ($this->rules as $userAgent => $directive) { - foreach ($directive as $directiveName => $directiveValue) { - if (is_array($directiveValue)) { - $this->rules[$userAgent][$directiveName] = array_values(array_unique($directiveValue)); - } - } - } - } - - /** - * Check if we should switch - * @return bool + * @return array */ - private function shouldSwitchToZeroPoint() + private function getAllowedDirectives() { - return in_array(strtolower($this->current_word), array( - self::DIRECTIVE_NOINDEX, + return [ + self::DIRECTIVE_NOINDEX, self::DIRECTIVE_ALLOW, self::DIRECTIVE_DISALLOW, self::DIRECTIVE_HOST, - self::DIRECTIVE_USERAGENT, self::DIRECTIVE_SITEMAP, + self::DIRECTIVE_USERAGENT, self::DIRECTIVE_CRAWL_DELAY, - self::DIRECTIVE_CLEAN_PARAM, - ), true); - } - - /** - * Process state ZERO_POINT - * @return RobotsTxtParser - */ - private function zeroPoint() - { - if ($this->shouldSwitchToZeroPoint()) { - $this->switchState(self::STATE_READ_DIRECTIVE); - } // unknown directive - skip it - elseif ($this->newLine()) { - $this->current_word = ""; - $this->increment(); - } - else { - $this->increment(); - } - return $this; + self::DIRECTIVE_CLEAN_PARAM, + ]; } /** - * Read directive - * @return RobotsTxtParser + * Parse rules + * + * @return void */ - private function readDirective() + private function prepareRules() { - $this->current_directive = strtolower(trim($this->current_word)); - - $this->increment(); + $rows = explode(PHP_EOL, $this->content); - if ($this->lineSeparator()) { - $this->current_word = ""; - $this->switchState(self::STATE_READ_VALUE); - } - else { - if ($this->space()) { - $this->switchState(self::STATE_SKIP_SPACE); - } - if ($this->sharp()) { - $this->switchState(self::STATE_SKIP_LINE); + foreach ($rows as $row) { + $row = preg_replace('/#.*/', '', $row); + $parts = explode(':', $row, 2); + if (count($parts) < 2) { + continue; } - } - return $this; - } - - /** - * Skip space - * @return RobotsTxtParser - */ - private function skipSpace() - { - $this->char_index++; - $this->current_word = mb_substr($this->current_word, -1); - return $this; - } - /** - * Skip line - * @return RobotsTxtParser - */ - private function skipLine() - { - $this->char_index++; - $this->switchState(self::STATE_ZERO_POINT); - return $this; - } + $directive = trim(strtolower($parts[0])); + $value = trim($parts[1]); - /** - * Read value - * @return RobotsTxtParser - */ - private function readValue() - { - if ($this->newLine()) { - $this->assignValueToDirective(); - } - elseif ($this->sharp()) { - $this->current_word = mb_substr($this->current_word, 0, -1); - $this->assignValueToDirective(); - } - else { - $this->increment(); + $this->handleDirective($directive, $value); } - return $this; + + $this->removeDublicates(); } - private function assignValueToDirective() + private function removeDublicates() { - if ($this->directiveUserAgent()) { - $this->userAgent = mb_strtolower(trim($this->current_word)); - if (!isset($this->rules[$this->userAgent])) { - $this->rules[$this->userAgent] = array(); - } - } - elseif ($this->directiveCrawlDelay()) { - $this->rules[$this->userAgent][$this->current_directive] = (double)$this->current_word; - } - elseif ($this->directiveSitemap()) { - $this->rules[self::USER_AGENT_ALL][$this->current_directive][] = $this->current_word; - } - elseif ($this->directiveCleanParam()) { - $this->rules[$this->userAgent][$this->current_directive][] = trim($this->current_word); - } - elseif ($this->directiveHost()) { - if (empty($this->rules['*'][$this->current_directive])) { // save only first host directive value, assign to '*' - $this->rules['*'][$this->current_directive] = $this->current_word; - } - } - elseif ($this->directiveNoIndex()) { - $this->rules[$this->userAgent][$this->current_directive][] = trim($this->current_word); - } - else { - if (!empty($this->current_word)) { - $this->rules[$this->userAgent][$this->current_directive][] = $this->current_word; + foreach ($this->rules as $userAgent => $rules) { + foreach ($this->rules[$userAgent] as $directive => $value) { + if (is_array($this->rules[$userAgent][$directive])) { + $this->rules[$userAgent][$directive] = array_values(array_unique($this->rules[$userAgent][$directive])); + } } } - $this->current_word = ''; - $this->current_directive = ''; - $this->switchState(self::STATE_ZERO_POINT); } /** - * Machine step + * Handle directive with value + * Assign value to directive * - * @return void + * @param string $directive + * @param string $value */ - private function step() + private function handleDirective($directive, $value) { - switch ($this->state) { - case self::STATE_ZERO_POINT: - $this->zeroPoint(); + if (!in_array($directive, $this->getAllowedDirectives())) { + return; + } + + switch ($directive) { + case self::DIRECTIVE_USERAGENT: + $this->currentDirective = $directive; + $this->userAgent = strtolower($value); + + if (!isset($this->rules[$this->userAgent])) { + $this->rules[$this->userAgent] = []; + } + break; - case self::STATE_READ_DIRECTIVE: - $this->readDirective(); + case self::DIRECTIVE_DISALLOW: + $this->currentDirective = $directive; + + if ($this->userAgent && $value) { + $this->rules[$this->userAgent][self::DIRECTIVE_DISALLOW][] = $value; + } + break; + case self::DIRECTIVE_CRAWL_DELAY: + $this->currentDirective = $directive; + + if ($this->userAgent && $value) { + $this->rules[$this->userAgent][self::DIRECTIVE_CRAWL_DELAY] = (double) $value; + } - case self::STATE_SKIP_SPACE: - $this->skipSpace(); break; - case self::STATE_SKIP_LINE: - $this->skipLine(); + case self::DIRECTIVE_SITEMAP: + $this->currentDirective = $directive; + + if ($value) { + $this->rules[self::USER_AGENT_ALL][self::DIRECTIVE_SITEMAP][] = $value; + } + break; - case self::STATE_READ_VALUE: - $this->readValue(); + case self::DIRECTIVE_HOST: + $this->currentDirective = $directive; + + if ($value && empty($this->rules[self::USER_AGENT_ALL][self::DIRECTIVE_HOST])) { + $this->rules[self::USER_AGENT_ALL][self::DIRECTIVE_HOST] = $value; + } + break; - } - } - /** - * Move to the following step - * - * @return void - */ - private function increment() - { - $this->current_char = mb_substr($this->content, $this->char_index, 1); - $this->current_word .= $this->current_char; - if (!$this->directiveCleanParam() && !$this->directiveUserAgent()) { - $this->current_word = trim($this->current_word); + default: + $this->currentDirective = $directive; + + if (!empty($this->userAgent)) { + $this->rules[$this->userAgent][$this->currentDirective][] = $value; + } + + break; } - $this->char_index++; } } diff --git a/tests/NoIndexTest.php b/tests/NoIndexTest.php index 20fede0..6dcd7af 100644 --- a/tests/NoIndexTest.php +++ b/tests/NoIndexTest.php @@ -21,7 +21,7 @@ public function testNoIndex($robotsTxtContent) $rules = $parser->getRules(); $this->assertArrayHasKey('*', $rules); $this->assertArrayHasKey('noindex', $rules['*']); - $this->assertEquals(2, count($rules['*']['noindex']), 'wrong noindex directive count'); + $this->assertEquals(2, count($rules['*']['noindex']), 'wrong noindex directive count'); } /** @@ -32,10 +32,10 @@ public function generateDataForTest() { return array( array(" - User-agent: * - Noindex: /page-a.html - Noindex: /article-* - ") + User-agent: * + Noindex: /page-a.html + Noindex: /article-* + ") ); } } diff --git a/tests/RelativePathTest.php b/tests/RelativePathTest.php index c6742f1..450bfcf 100644 --- a/tests/RelativePathTest.php +++ b/tests/RelativePathTest.php @@ -8,7 +8,7 @@ class RelativePathTest extends \PHPUnit\Framework\TestCase public static function setUpBeforeClass() { require_once(realpath(__DIR__.'/../RobotsTxtParser.php')); - require_once(realpath(__DIR__.'/../RobotsTxtValidator.php')); + require_once(realpath(__DIR__.'/../RobotsTxtValidator.php')); } /** @@ -47,6 +47,6 @@ public function generateDataForTest() User-agent: * Disallow: /*/?replytocom=* ") - ); + ); } }