* @author Nick Pope * @copyright Copyright © 2010, Mike Cochrane, Nick Pope * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 */ namespace App\Util\Lexer; /** * Twitter Extractor Class. * * Parses tweets and extracts URLs, usernames, username/list pairs and * hashtags. * * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this * is based on code by {@link http://github.com/mzsanford Matt Sanford} and * heavily modified by {@link http://github.com/ngnpope Nick Pope}. * * @author Mike Cochrane * @author Nick Pope * @copyright Copyright © 2010, Mike Cochrane, Nick Pope * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0 */ class Extractor extends Regex { /** * @var bool */ protected $extractURLWithoutProtocol = true; /** * Provides fluent method chaining. * * @param string $tweet The tweet to be converted. * * @see __construct() * * @return Extractor */ public static function create($tweet = null) { return new self($tweet); } /** * Reads in a tweet to be parsed and extracts elements from it. * * Extracts various parts of a tweet including URLs, usernames, hashtags... * * @param string $tweet The tweet to extract. */ public function __construct($tweet = null) { parent::__construct($tweet); } /** * Extracts all parts of a tweet and returns an associative array containing * the extracted elements. * * @param string $tweet The tweet to extract. * * @return array The elements in the tweet. */ public function extract($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } return [ 'hashtags' => $this->extractHashtags($tweet), 'urls' => $this->extractURLs($tweet), 'mentions' => $this->extractMentionedUsernames($tweet), 'replyto' => $this->extractRepliedUsernames($tweet), 'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet), 'urls_with_indices' => $this->extractURLsWithIndices($tweet), 'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices($tweet), ]; } /** * Extract URLs, @mentions, lists and #hashtag from a given text/tweet. * * @param string $tweet The tweet to extract. * * @return array list of extracted entities */ public function extractEntitiesWithIndices($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } $entities = []; $entities = array_merge($entities, $this->extractURLsWithIndices($tweet)); $entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false)); $entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet)); $entities = array_merge($entities, $this->extractCashtagsWithIndices($tweet)); $entities = $this->removeOverlappingEntities($entities); return $entities; } /** * Extracts all the hashtags from the tweet. * * @param string $tweet The tweet to extract. * * @return array The hashtag elements in the tweet. */ public function extractHashtags($tweet = null) { $hashtagsOnly = []; $hashtagsWithIndices = $this->extractHashtagsWithIndices($tweet); foreach ($hashtagsWithIndices as $hashtagWithIndex) { $hashtagsOnly[] = $hashtagWithIndex['hashtag']; } return $hashtagsOnly; } /** * Extracts all the cashtags from the tweet. * * @param string $tweet The tweet to extract. * * @return array The cashtag elements in the tweet. */ public function extractCashtags($tweet = null) { $cashtagsOnly = []; $cashtagsWithIndices = $this->extractCashtagsWithIndices($tweet); foreach ($cashtagsWithIndices as $cashtagWithIndex) { $cashtagsOnly[] = $cashtagWithIndex['cashtag']; } return $cashtagsOnly; } /** * Extracts all the URLs from the tweet. * * @param string $tweet The tweet to extract. * * @return array The URL elements in the tweet. */ public function extractURLs($tweet = null) { $urlsOnly = []; $urlsWithIndices = $this->extractURLsWithIndices($tweet); foreach ($urlsWithIndices as $urlWithIndex) { $urlsOnly[] = $urlWithIndex['url']; } return $urlsOnly; } /** * Extract all the usernames from the tweet. * * A mention is an occurrence of a username anywhere in a tweet. * * @param string $tweet The tweet to extract. * * @return array The usernames elements in the tweet. */ public function extractMentionedScreennames($tweet = null) { $usernamesOnly = []; $mentionsWithIndices = $this->extractMentionsOrListsWithIndices($tweet); foreach ($mentionsWithIndices as $mentionWithIndex) { $screen_name = mb_strtolower($mentionWithIndex['screen_name']); if (empty($screen_name) or in_array($screen_name, $usernamesOnly)) { continue; } $usernamesOnly[] = $screen_name; } return $usernamesOnly; } /** * Extract all the usernames from the tweet. * * A mention is an occurrence of a username anywhere in a tweet. * * @return array The usernames elements in the tweet. * * @deprecated since version 1.1.0 */ public function extractMentionedUsernames($tweet) { $this->tweet = $tweet; return $this->extractMentionedScreennames($tweet); } /** * Extract all the usernames replied to from the tweet. * * A reply is an occurrence of a username at the beginning of a tweet. * * @param string $tweet The tweet to extract. * * @return array The usernames replied to in a tweet. */ public function extractReplyScreenname($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } $matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches); // Check username ending in if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) { $matched = false; } return $matched ? $matches[1] : null; } /** * Extract all the usernames replied to from the tweet. * * A reply is an occurrence of a username at the beginning of a tweet. * * @return array The usernames replied to in a tweet. * * @deprecated since version 1.1.0 */ public function extractRepliedUsernames() { return $this->extractReplyScreenname(); } /** * Extracts all the hashtags and the indices they occur at from the tweet. * * @param string $tweet The tweet to extract. * @param bool $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones * * @return array The hashtag elements in the tweet. */ public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true) { if (is_null($tweet)) { $tweet = $this->tweet; } if (!preg_match('/[##]/iu', $tweet)) { return []; } preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $tags = []; foreach ($matches as $match) { list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 3, ['', 0]); $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1]; $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]); if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) { continue; } $tags[] = [ 'hashtag' => $hashtag[0], 'indices' => [$start_position, $end_position], ]; } if (!$checkUrlOverlap) { return $tags; } // check url overlap $urls = $this->extractURLsWithIndices($tweet); $entities = $this->removeOverlappingEntities(array_merge($tags, $urls)); $validTags = []; foreach ($entities as $entity) { if (empty($entity['hashtag'])) { continue; } $validTags[] = $entity; } return $validTags; } /** * Extracts all the cashtags and the indices they occur at from the tweet. * * @param string $tweet The tweet to extract. * * @return array The cashtag elements in the tweet. */ public function extractCashtagsWithIndices($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } if (!preg_match('/\$/iu', $tweet)) { return []; } preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $tags = []; foreach ($matches as $match) { list($all, $before, $dollar, $cash_text, $outer) = array_pad($match, 3, ['', 0]); $start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1]; $end_position = $start_position + StringUtils::strlen($dollar[0].$cash_text[0]); if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) { continue; } $tags[] = [ 'cashtag' => $cash_text[0], 'indices' => [$start_position, $end_position], ]; } return $tags; } /** * Extracts all the URLs and the indices they occur at from the tweet. * * @param string $tweet The tweet to extract. * * @return array The URLs elements in the tweet. */ public function extractURLsWithIndices($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } $needle = $this->extractURLWithoutProtocol() ? '.' : ':'; if (strpos($tweet, $needle) === false) { return []; } $urls = []; preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); foreach ($matches as $match) { list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, ['']); $start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1]; $end_position = $start_position + StringUtils::strlen($url[0]); $all = $all[0]; $before = $before[0]; $url = $url[0]; $protocol = $protocol[0]; $domain = $domain[0]; $port = $port[0]; $path = $path[0]; $query = $query[0]; // If protocol is missing and domain contains non-ASCII characters, // extract ASCII-only domains. if (empty($protocol)) { if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) { continue; } $last_url = null; $ascii_end_position = 0; if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) { $asciiDomain[0] = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url); $ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position); $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]); $last_url = [ 'url' => $asciiDomain[0], 'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position], ]; if (!empty($path) || preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0]) || !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) { $urls[] = $last_url; } } // no ASCII-only domain found. Skip the entire URL if (empty($last_url)) { continue; } // $last_url only contains domain. Need to add path and query if they exist. if (!empty($path)) { // last_url was not added. Add it to urls here. $last_url['url'] = preg_replace('/'.preg_quote($domain, '/').'/u', $last_url['url'], $url); $last_url['indices'][1] = $end_position; } } else { // In the case of t.co URLs, don't allow additional path characters if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) { $url = $tcoUrlMatches[0]; $end_position = $start_position + StringUtils::strlen($url); } $urls[] = [ 'url' => $url, 'indices' => [$start_position, $end_position], ]; } } return $urls; } /** * Extracts all the usernames and the indices they occur at from the tweet. * * @param string $tweet The tweet to extract. * * @return array The username elements in the tweet. */ public function extractMentionedScreennamesWithIndices($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } $usernamesOnly = []; $mentions = $this->extractMentionsOrListsWithIndices($tweet); foreach ($mentions as $mention) { if (isset($mention['list_slug'])) { unset($mention['list_slug']); } $usernamesOnly[] = $mention; } return $usernamesOnly; } /** * Extracts all the usernames and the indices they occur at from the tweet. * * @return array The username elements in the tweet. * * @deprecated since version 1.1.0 */ public function extractMentionedUsernamesWithIndices() { return $this->extractMentionedScreennamesWithIndices(); } /** * Extracts all the usernames and the indices they occur at from the tweet. * * @param string $tweet The tweet to extract. * * @return array The username elements in the tweet. */ public function extractMentionsOrListsWithIndices($tweet = null) { if (is_null($tweet)) { $tweet = $this->tweet; } if (!preg_match('/[@@]/iu', $tweet)) { return []; } preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); $results = []; foreach ($matches as $match) { list($all, $before, $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]); $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1]; $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]); $entity = [ 'screen_name' => $username[0], 'list_slug' => $list_slug[0], 'indices' => [$start_position, $end_position], ]; if (preg_match(self::$patterns['end_mention_match'], $outer[0])) { continue; } if (!empty($list_slug[0])) { $entity['indices'][1] = $end_position + StringUtils::strlen($list_slug[0]); } $results[] = $entity; } return $results; } /** * Extracts all the usernames and the indices they occur at from the tweet. * * @return array The username elements in the tweet. * * @deprecated since version 1.1.0 */ public function extractMentionedUsernamesOrListsWithIndices() { return $this->extractMentionsOrListsWithIndices(); } /** * setter/getter for extractURLWithoutProtocol. * * @param bool $flag * * @return Extractor */ public function extractURLWithoutProtocol($flag = null) { if (is_null($flag)) { return $this->extractURLWithoutProtocol; } $this->extractURLWithoutProtocol = (bool) $flag; return $this; } /** * Remove overlapping entities. * This returns a new array with no overlapping entities. * * @param array $entities * * @return array */ public function removeOverlappingEntities($entities) { $result = []; usort($entities, [$this, 'sortEntites']); $prev = null; foreach ($entities as $entity) { if (isset($prev) && $entity['indices'][0] < $prev['indices'][1]) { continue; } $prev = $entity; $result[] = $entity; } return $result; } /** * sort by entity start index. * * @param array $a * @param array $b * * @return int */ protected function sortEntites($a, $b) { if ($a['indices'][0] == $b['indices'][0]) { return 0; } return ($a['indices'][0] < $b['indices'][0]) ? -1 : 1; } }