summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'Translate/ttmserver/ElasticSearchTTMServer.php')
-rw-r--r--Translate/ttmserver/ElasticSearchTTMServer.php864
1 files changed, 864 insertions, 0 deletions
diff --git a/Translate/ttmserver/ElasticSearchTTMServer.php b/Translate/ttmserver/ElasticSearchTTMServer.php
new file mode 100644
index 00000000..f3a950e1
--- /dev/null
+++ b/Translate/ttmserver/ElasticSearchTTMServer.php
@@ -0,0 +1,864 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+use Elastica\Aggregation\Terms;
+use Elastica\Client;
+use Elastica\Document;
+use Elastica\Exception\ExceptionInterface;
+use Elastica\Query;
+use Elastica\Query\BoolQuery;
+use Elastica\Query\FunctionScore;
+use Elastica\Query\MatchQuery;
+use Elastica\Query\Term;
+use MediaWiki\Extension\Elastica\MWElasticUtils;
+use MediaWiki\Extension\Translate\TranslatorInterface\TranslationHelperException;
+use MediaWiki\Logger\LoggerFactory;
+
+/**
+ * TTMServer backed based on ElasticSearch. Depends on Elastica.
+ * @since 2014.04
+ * @ingroup TTMServer
+ */
+class ElasticSearchTTMServer
+ extends TTMServer
+ implements ReadableTTMServer, WritableTTMServer, SearchableTTMServer
+{
+ /**
+ * @const int in case a write operation fails during a batch process
+ * this constant controls the number of times we will retry the same
+ * operation.
+ */
+ private const BULK_INDEX_RETRY_ATTEMPTS = 5;
+
+ /**
+ * @const int time (seconds) to wait for the index to ready before
+ * starting to index. Since we wait for index status it can be relatively
+ * long especially if some nodes are restarted.
+ */
+ private const WAIT_UNTIL_READY_TIMEOUT = 3600;
+
+ /** @var Client */
+ protected $client;
+ /**
+ * Reference to the maintenance script to relay logging output.
+ */
+ protected $logger;
+ /**
+ * Used for Reindex
+ */
+ protected $updateMapping = false;
+
+ public function isLocalSuggestion( array $suggestion ) {
+ return $suggestion['wiki'] === WikiMap::getCurrentWikiId();
+ }
+
+ public function expandLocation( array $suggestion ) {
+ return $suggestion['uri'];
+ }
+
+ public function query( $sourceLanguage, $targetLanguage, $text ) {
+ try {
+ return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
+ } catch ( Exception $e ) {
+ throw new TranslationHelperException( 'Elastica exception: ' . $e );
+ }
+ }
+
+ protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
+ if ( !$this->useWikimediaExtraPlugin() ) {
+ // ElasticTTM is currently not compatible with elasticsearch 2.x/5.x
+ // It needs FuzzyLikeThis ported via the wmf extra plugin
+ throw new RuntimeException( 'The wikimedia extra plugin is mandatory.' );
+ }
+ /* Two query system:
+ * 1) Find all strings in source language that match text
+ * 2) Do another query for translations for those strings
+ */
+ $connection = $this->getClient()->getConnection();
+ $oldTimeout = $connection->getTimeout();
+ $connection->setTimeout( 10 );
+
+ $fuzzyQuery = new FuzzyLikeThis();
+ $fuzzyQuery->setLikeText( $text );
+ $fuzzyQuery->addFields( [ 'content' ] );
+
+ $boostQuery = new FunctionScore();
+ $boostQuery->addFunction(
+ 'levenshtein_distance_score',
+ [
+ 'text' => $text,
+ 'field' => 'content'
+ ]
+ );
+ $boostQuery->setBoostMode( FunctionScore::BOOST_MODE_REPLACE );
+
+ // Wrap the fuzzy query so it can be used as a filter.
+ // This is slightly faster, as ES can throw away the scores by this query.
+ $bool = new BoolQuery();
+ $bool->addFilter( $fuzzyQuery );
+ $bool->addMust( $boostQuery );
+
+ $languageFilter = new Term();
+ $languageFilter->setTerm( 'language', $sourceLanguage );
+ $bool->addFilter( $languageFilter );
+
+ // The whole query
+ $query = new Query();
+ $query->setQuery( $bool );
+
+ // The interface usually displays three best candidates. These might
+ // come from more than three source things, if the translations are
+ // the same. In other words suggestions are grouped by the suggested
+ // translation. This algorithm might not find all suggestions, if the
+ // top N best matching source texts don't have equivalent translations
+ // in the target language, but worse matches which we did not fetch do.
+ // This code tries to balance between doing too many or too big queries
+ // and not fetching enough results to show all possible suggestions.
+ $sizeFirst = 100;
+ $sizeSecond = $sizeFirst * 5;
+
+ $query->setFrom( 0 );
+ $query->setSize( $sizeFirst );
+ $query->setParam( '_source', [ 'content' ] );
+ $cutoff = $this->config['cutoff'] ?? 0.65;
+ $query->setParam( 'min_score', $cutoff );
+ $query->setSort( [ '_score', 'wiki', 'localid' ] );
+
+ /* This query is doing two unrelated things:
+ * 1) Collect the message contents and scores so that they can
+ * be accessed later for the translations we found.
+ * 2) Build the query string for the query that fetches the translations.
+ */
+ $contents = $scores = $terms = [];
+ do {
+ $resultset = $this->getIndex()->search( $query );
+
+ if ( count( $resultset ) === 0 ) {
+ break;
+ }
+
+ foreach ( $resultset->getResults() as $result ) {
+ $data = $result->getData();
+ $score = $result->getScore();
+
+ $sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );
+ $contents[$sourceId] = $data['content'];
+ $scores[$sourceId] = $score;
+ $terms[] = "$sourceId/$targetLanguage";
+ }
+
+ // Check if it looks like that we are hitting the long tail already.
+ // Otherwise, we'll do a query to fetch some more to reach a "sane"
+ // breaking point, i.e. include all suggestions with same content
+ // for reliable used X times statistics.
+ if ( count( array_unique( $scores ) ) > 5 ) {
+ break;
+ }
+
+ // Okay, We are now in second iteration of the loop. We already got
+ // lots of suggestions. We will give up for now even if it means we
+ // return in some sense incomplete results.
+ if ( count( $resultset ) === $sizeSecond ) {
+ break;
+ }
+
+ // After the first query, the smallest score is the new threshold.
+ // @phan-suppress-next-line PhanPossiblyUndeclaredVariable
+ $query->setParam( 'min_score', $score );
+ $query->setFrom( $query->getParam( 'size' ) + $query->getParam( 'from' ) );
+ $query->setSize( $sizeSecond );
+
+ // Break if we already got all hits
+ } while ( $resultset->getTotalHits() > count( $contents ) );
+
+ $suggestions = [];
+
+ // Skip second query if first query found nothing. Keeping only one return
+ // statement in this method to avoid forgetting to reset connection timeout
+ if ( $terms !== [] ) {
+ $idQuery = new Query\Terms( '_id', $terms );
+
+ $query = new Query( $idQuery );
+ $query->setSize( 25 );
+ $query->setParam( '_source', [ 'wiki', 'uri', 'content', 'localid' ] );
+ $resultset = $this->getIndex()->search( $query );
+
+ foreach ( $resultset->getResults() as $result ) {
+ $data = $result->getData();
+
+ // Construct the matching source id
+ $sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );
+
+ $suggestions[] = [
+ 'source' => $contents[$sourceId],
+ 'target' => $data['content'],
+ 'context' => $data['localid'],
+ 'quality' => $scores[$sourceId],
+ 'wiki' => $data['wiki'],
+ 'location' => $data['localid'] . '/' . $targetLanguage,
+ 'uri' => $data['uri'],
+ ];
+ }
+
+ // Ensure results are in quality order
+ uasort( $suggestions, static function ( $a, $b ) {
+ if ( $a['quality'] === $b['quality'] ) {
+ return 0;
+ }
+
+ return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
+ } );
+ }
+
+ $connection->setTimeout( $oldTimeout );
+
+ return $suggestions;
+ }
+
+ /* Write functions */
+
+ /**
+ * Add / update translations.
+ *
+ * @param MessageHandle $handle
+ * @param ?string $targetText
+ * @return bool
+ * @throws RuntimeException
+ */
+ public function update( MessageHandle $handle, $targetText ) {
+ if ( !$handle->isValid() || $handle->getCode() === '' ) {
+ return false;
+ }
+
+ /* There are various different cases here:
+ * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
+ * 1) We don't distinguish between new or updated here.
+ * 2) Delete old translation, but not definition
+ * 3) Insert new translation or definition, if non-fuzzy
+ * The definition should never be fuzzied anyway.
+ *
+ * These only apply to known messages.
+ */
+
+ $sourceLanguage = $handle->getGroup()->getSourceLanguage();
+
+ // Do not delete definitions, because the translations are attached to that
+ if ( $handle->getCode() !== $sourceLanguage ) {
+ $localid = $handle->getTitleForBase()->getPrefixedText();
+ $this->deleteByQuery( $this->getIndex(), Query::create(
+ ( new BoolQuery() )
+ ->addFilter( new Term( [ 'wiki' => WikiMap::getCurrentWikiId() ] ) )
+ ->addFilter( new Term( [ 'language' => $handle->getCode() ] ) )
+ ->addFilter( new Term( [ 'localid' => $localid ] ) ) ) );
+ }
+
+ // If translation was made fuzzy, we do not need to add anything
+ if ( $targetText === null ) {
+ return true;
+ }
+
+ // source language is null, skip doing rest of the stuff
+ if ( $sourceLanguage === null ) {
+ return true;
+ }
+
+ $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
+ $doc = $this->createDocument( $handle, $targetText, $revId );
+ $fname = __METHOD__;
+
+ $mwElasticUtilsClass = $this->getMWElasticUtilsClass();
+ $mwElasticUtilsClass::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS,
+ function () use ( $doc ) {
+ $this->getIndex()->addDocuments( [ $doc ] );
+ },
+ static function ( $e, $errors ) use ( $fname ) {
+ $c = get_class( $e );
+ $msg = $e->getMessage();
+ error_log( $fname . ": update failed ($c: $msg); retrying." );
+ sleep( 10 );
+ }
+ );
+
+ return true;
+ }
+
+ /**
+ * @param MessageHandle $handle
+ * @param string $text
+ * @param int $revId
+ * @return Document
+ */
+ protected function createDocument( MessageHandle $handle, $text, $revId ) {
+ $language = $handle->getCode();
+
+ $localid = $handle->getTitleForBase()->getPrefixedText();
+ $wiki = WikiMap::getCurrentWikiId();
+ $globalid = "$wiki-$localid-$revId/$language";
+
+ $data = [
+ 'wiki' => $wiki,
+ 'uri' => $handle->getTitle()->getCanonicalURL(),
+ 'localid' => $localid,
+ 'language' => $language,
+ 'content' => $text,
+ 'group' => $handle->getGroupIds(),
+ ];
+
+ return new Document( $globalid, $data, '_doc' );
+ }
+
+ /**
+ * Create index
+ * @param bool $rebuild Deletes index first if already exists
+ */
+ public function createIndex( $rebuild ) {
+ $indexSettings = [
+ 'settings' => [
+ 'index' => [
+ 'number_of_shards' => $this->getShardCount(),
+ 'analysis' => [
+ 'filter' => [
+ 'prefix_filter' => [
+ 'type' => 'edge_ngram',
+ 'min_gram' => 2,
+ 'max_gram' => 20
+ ]
+ ],
+ 'analyzer' => [
+ 'prefix' => [
+ 'type' => 'custom',
+ 'tokenizer' => 'standard',
+ 'filter' => [ 'lowercase', 'prefix_filter' ]
+ ],
+ 'casesensitive' => [
+ 'tokenizer' => 'standard'
+ ]
+ ]
+ ]
+ ],
+ ],
+ ];
+ $replicas = $this->getReplicaCount();
+ if ( strpos( $replicas, '-' ) === false ) {
+ $indexSettings['settings']['index']['number_of_replicas'] = $replicas;
+ } else {
+ $indexSettings['settings']['index']['auto_expand_replicas'] = $replicas;
+ }
+
+ $this->getIndex()->create( $indexSettings, $rebuild );
+ }
+
+ /**
+ * Begin the bootstrap process.
+ *
+ * @throws RuntimeException
+ */
+ public function beginBootstrap() {
+ $this->checkElasticsearchVersion();
+ $index = $this->getIndex();
+ if ( $this->updateMapping ) {
+ $this->logOutput( 'Updating the index mappings...' );
+ $this->createIndex( true );
+ } elseif ( !$index->exists() ) {
+ $this->createIndex( false );
+ }
+
+ $settings = $index->getSettings();
+ $settings->setRefreshInterval( '-1' );
+
+ $this->deleteByQuery( $this->getIndex(), Query::create(
+ ( new Term() )->setTerm( 'wiki', WikiMap::getCurrentWikiId() ) ) );
+
+ $properties = [
+ 'wiki' => [ 'type' => 'keyword' ],
+ 'localid' => [ 'type' => 'keyword' ],
+ 'uri' => [ 'type' => 'keyword' ],
+ 'language' => [ 'type' => 'keyword' ],
+ 'group' => [ 'type' => 'keyword' ],
+ 'content' => [
+ 'type' => 'text',
+ 'fields' => [
+ 'content' => [
+ 'type' => 'text',
+ 'term_vector' => 'yes'
+ ],
+ 'prefix_complete' => [
+ 'type' => 'text',
+ 'analyzer' => 'prefix',
+ 'search_analyzer' => 'standard',
+ 'term_vector' => 'yes'
+ ],
+ 'case_sensitive' => [
+ 'type' => 'text',
+ 'analyzer' => 'casesensitive',
+ 'term_vector' => 'yes'
+ ]
+ ]
+ ],
+ ];
+ if ( $this->useElastica6() ) {
+ // Elastica 6 support
+ // @phan-suppress-next-line PhanUndeclaredClassMethod
+ $mapping = new \Elastica\Type\Mapping();
+ // @phan-suppress-next-line PhanUndeclaredMethod, PhanUndeclaredClassMethod
+ $mapping->setType( $index->getType( '_doc' ) );
+ // @phan-suppress-next-line PhanUndeclaredClassMethod
+ $mapping->setProperties( $properties );
+ // @phan-suppress-next-line PhanUndeclaredClassMethod
+ $mapping->send( [ 'include_type_name' => 'true' ] );
+ } else {
+ // Elastica 7
+ $mapping = new \Elastica\Mapping( $properties );
+ $mapping->send( $index, [ 'include_type_name' => 'false' ] );
+ }
+
+ $this->waitUntilReady();
+ }
+
+ public function beginBatch() {
+ // I hate the rule that forbids {}
+ }
+
+ /**
+ * @param array[] $batch
+ * @phan-param array<int,array{0:MessageHandle,1:string,2:string}> $batch
+ */
+ public function batchInsertDefinitions( array $batch ) {
+ $lb = new LinkBatch();
+ foreach ( $batch as $data ) {
+ $lb->addObj( $data[0]->getTitle() );
+ }
+ $lb->execute();
+
+ $this->batchInsertTranslations( $batch );
+ }
+
+ public function batchInsertTranslations( array $batch ) {
+ $docs = [];
+ foreach ( $batch as $data ) {
+ [ $handle, $sourceLanguage, $text ] = $data;
+ $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
+ $docs[] = $this->createDocument( $handle, $text, $revId );
+ }
+
+ $mwElasticUtilsClass = $this->getMWElasticUtilsClass();
+ $mwElasticUtilsClass::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS,
+ function () use ( $docs ) {
+ $this->getIndex()->addDocuments( $docs );
+ },
+ function ( $e, $errors ) {
+ $c = get_class( $e );
+ $msg = $e->getMessage();
+ $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
+ sleep( 10 );
+ }
+ );
+ }
+
+ public function endBatch() {
+ // I hate the rule that forbids {}
+ }
+
+ public function endBootstrap() {
+ $index = $this->getIndex();
+ $index->refresh();
+ $index->forcemerge();
+ $index->getSettings()->setRefreshInterval( '5s' );
+ }
+
+ public function getClient() {
+ if ( !$this->client ) {
+ if ( isset( $this->config['config'] ) ) {
+ $this->client = new Client( $this->config['config'] );
+ } else {
+ $this->client = new Client();
+ }
+ }
+ return $this->client;
+ }
+
+ /** @return true if the backend is configured with the wikimedia extra plugin */
+ public function useWikimediaExtraPlugin() {
+ return isset( $this->config['use_wikimedia_extra'] ) && $this->config['use_wikimedia_extra'];
+ }
+
+ /** @return string */
+ private function getIndexName() {
+ return $this->config['index'] ?? 'ttmserver';
+ }
+
+ public function getIndex() {
+ return $this->getClient()
+ ->getIndex( $this->getIndexName() );
+ }
+
+ protected function getShardCount() {
+ return $this->config['shards'] ?? 1;
+ }
+
+ protected function getReplicaCount() {
+ return $this->config['replicas'] ?? '0-2';
+ }
+
+ /**
+ * Get index health
+ * TODO: Remove this code in the future as we drop support for
+ * older versions of the Elastica extension.
+ *
+ * @param string $indexName
+ * @return array the index health status
+ */
+ protected function getIndexHealth( $indexName ) {
+ $path = "_cluster/health/$indexName";
+ $response = $this->getClient()->request( $path );
+ if ( $response->hasError() ) {
+ throw new Exception( "Error while fetching index health status: " . $response->getError() );
+ }
+ return $response->getData();
+ }
+
+ /**
+ * Wait for the index to go green
+ *
+ * NOTE: This method has been copied and adjusted from
+ * CirrusSearch/includes/Maintenance/ConfigUtils.php. Ideally we'd
+ * like to make these utility methods available in the Elastica
+ * extension, but this one requires some refactoring in cirrus first.
+ * TODO: Remove this code in the future as we drop support for
+ * older versions of the Elastica extension.
+ *
+ * @param string $indexName
+ * @param int $timeout
+ * @return bool true if the index is green false otherwise.
+ */
+ protected function waitForGreen( $indexName, $timeout ) {
+ $startTime = time();
+ while ( ( $startTime + $timeout ) > time() ) {
+ try {
+ $response = $this->getIndexHealth( $indexName );
+ $status = $response['status'] ?? 'unknown';
+ if ( $status === 'green' ) {
+ $this->logOutput( "\tGreen!" );
+ return true;
+ }
+ $this->logOutput( "\tIndex is $status retrying..." );
+ sleep( 5 );
+ } catch ( Exception $e ) {
+ $this->logOutput( "Error while waiting for green ({$e->getMessage()}), retrying..." );
+ }
+ }
+ return false;
+ }
+
+ protected function waitUntilReady() {
+ $mwElasticUtilsClass = $this->getMWElasticUtilsClass();
+ $statuses = $mwElasticUtilsClass::waitForGreen(
+ $this->getClient(),
+ $this->getIndexName(),
+ self::WAIT_UNTIL_READY_TIMEOUT );
+ $this->logOutput( "Waiting for the index to go green..." );
+ foreach ( $statuses as $message ) {
+ $this->logOutput( $message );
+ }
+
+ if ( !$statuses->getReturn() ) {
+ die( "Timeout! Please check server logs for {$this->getIndexName()}." );
+ }
+ }
+
+ public function setLogger( $logger ) {
+ $this->logger = $logger;
+ }
+
+ // Can it get any uglier?
+ protected function logOutput( $text ) {
+ if ( $this->logger ) {
+ $this->logger->statusLine( "$text\n" );
+ }
+ }
+
+ /**
+ * Force the update of index mappings
+ * @inheritDoc
+ */
+ public function setDoReIndex() {
+ $this->updateMapping = true;
+ }
+
+ /**
+ * Parse query string and build the search query
+ * @param string $queryString
+ * @param array $opts
+ * @return array
+ */
+ protected function parseQueryString( $queryString, array $opts ) {
+ $fields = $highlights = [];
+ $terms = preg_split( '/\s+/', $queryString );
+ $match = $opts['match'];
+ $case = $opts['case'];
+
+ // Map each word in the query string with its corresponding field
+ foreach ( $terms as $term ) {
+ $prefix = strstr( $term, '*', true );
+ if ( $prefix ) {
+ // For wildcard search
+ $fields['content.prefix_complete'][] = $prefix;
+ } elseif ( $case === '1' ) {
+ // For case sensitive search
+ $fields['content.case_sensitive'][] = $term;
+ } else {
+ $fields['content'][] = $term;
+ }
+ }
+
+ // Allow searching either by message content or message id (page name
+ // without language subpage) with exact match only.
+ $searchQuery = new BoolQuery();
+ foreach ( $fields as $analyzer => $words ) {
+ foreach ( $words as $word ) {
+ $boolQuery = new BoolQuery();
+ $contentQuery = new MatchQuery();
+ $contentQuery->setFieldQuery( $analyzer, $word );
+ $boolQuery->addShould( $contentQuery );
+ $messageQuery = new Term();
+ $messageQuery->setTerm( 'localid', $word );
+ $boolQuery->addShould( $messageQuery );
+
+ if ( $match === 'all' ) {
+ $searchQuery->addMust( $boolQuery );
+ } else {
+ $searchQuery->addShould( $boolQuery );
+ }
+
+ // Fields for highlighting
+ $highlights[$analyzer] = [
+ 'number_of_fragments' => 0
+ ];
+
+ // Allow searching by exact message title (page name with
+ // language subpage).
+ $title = Title::newFromText( $word );
+ if ( !$title ) {
+ continue;
+ }
+ $handle = new MessageHandle( $title );
+ if ( $handle->isValid() && $handle->getCode() !== '' ) {
+ $localid = $handle->getTitleForBase()->getPrefixedText();
+ $boolQuery = new BoolQuery();
+ $messageId = new Term();
+ $messageId->setTerm( 'localid', $localid );
+ $boolQuery->addMust( $messageId );
+ $searchQuery->addShould( $boolQuery );
+ }
+ }
+ }
+
+ return [ $searchQuery, $highlights ];
+ }
+
+ /**
+ * Search interface
+ * @param string $queryString
+ * @param array $opts
+ * @param array $highlight
+ * @return \Elastica\Search
+ */
+ public function createSearch( $queryString, $opts, $highlight ) {
+ $query = new Query();
+
+ [ $searchQuery, $highlights ] = $this->parseQueryString( $queryString, $opts );
+ $query->setQuery( $searchQuery );
+
+ $language = new Terms( 'language' );
+ $language->setField( 'language' );
+ $language->setSize( 500 );
+ $query->addAggregation( $language );
+
+ $group = new Terms( 'group' );
+ $group->setField( 'group' );
+ // Would like to prioritize the top level groups and not show subgroups
+ // if the top group has only few hits, but that doesn't seem to be possile.
+ $group->setSize( 500 );
+ $query->addAggregation( $group );
+
+ $query->setSize( $opts['limit'] );
+ $query->setFrom( $opts['offset'] );
+
+ // BoolAnd filters are executed in sequence per document. Bool filters with
+ // multiple must clauses are executed by converting each filter into a bit
+ // field then anding them together. The latter is normally faster if either
+ // of the subfilters are reused. May not make a difference in this context.
+ $filters = new BoolQuery();
+
+ $language = $opts['language'];
+ if ( $language !== '' ) {
+ $languageFilter = new Term();
+ $languageFilter->setTerm( 'language', $language );
+ $filters->addFilter( $languageFilter );
+ }
+
+ $group = $opts['group'];
+ if ( $group !== '' ) {
+ $groupFilter = new Term();
+ $groupFilter->setTerm( 'group', $group );
+ $filters->addFilter( $groupFilter );
+ }
+
+ // Check that we have at least one filter to avoid invalid query errors.
+ if ( $language !== '' || $group !== '' ) {
+ // TODO: This seems wrong, but perhaps for aggregation purposes?
+ // should make $search a must clause and use the bool query
+ // as main.
+ $query->setPostFilter( $filters );
+ }
+
+ [ $pre, $post ] = $highlight;
+ $query->setHighlight( [
+ // The value must be an object
+ 'pre_tags' => [ $pre ],
+ 'post_tags' => [ $post ],
+ 'fields' => $highlights,
+ ] );
+
+ return $this->getIndex()->createSearch( $query );
+ }
+
+ /**
+ * Search interface
+ * @param string $queryString
+ * @param array $opts
+ * @param array $highlight
+ * @throws TTMServerException
+ * @return \Elastica\ResultSet
+ */
+ public function search( $queryString, $opts, $highlight ) {
+ $search = $this->createSearch( $queryString, $opts, $highlight );
+
+ try {
+ return $search->search();
+ } catch ( ExceptionInterface $e ) {
+ throw new TTMServerException( $e->getMessage() );
+ }
+ }
+
+ /**
+ * @param \Elastica\ResultSet $resultset
+ * @return array
+ */
+ public function getFacets( $resultset ) {
+ $aggs = $resultset->getAggregations();
+ '@phan-var array[][][] $aggs';
+
+ $ret = [
+ 'language' => [],
+ 'group' => []
+ ];
+
+ foreach ( $aggs as $type => $info ) {
+ foreach ( $info['buckets'] as $row ) {
+ $ret[$type][$row['key']] = $row['doc_count'];
+ }
+ }
+
+ return $ret;
+ }
+
+ /**
+ * @param \Elastica\ResultSet $resultset
+ * @return int
+ */
+ public function getTotalHits( $resultset ) {
+ return $resultset->getTotalHits();
+ }
+
+ /**
+ * @param \Elastica\ResultSet $resultset
+ * @return array
+ */
+ public function getDocuments( $resultset ) {
+ $ret = [];
+ foreach ( $resultset->getResults() as $document ) {
+ $data = $document->getData();
+ $hl = $document->getHighlights();
+ if ( isset( $hl['content.prefix_complete'][0] ) ) {
+ $data['content'] = $hl['content.prefix_complete'][0];
+ } elseif ( isset( $hl['content.case_sensitive'][0] ) ) {
+ $data['content'] = $hl['content.case_sensitive'][0];
+ } elseif ( isset( $hl['content'][0] ) ) {
+ $data['content'] = $hl['content'][0];
+ }
+ $ret[] = $data;
+ }
+
+ return $ret;
+ }
+
+ /**
+ * Delete docs by query by using the scroll API.
+ * TODO: Elastica\Index::deleteByQuery() ? was removed
+ * in 2.x and returned in 5.x.
+ *
+ * @param \Elastica\Index $index the source index
+ * @param Query $query
+ * @throws RuntimeException
+ */
+ private function deleteByQuery( \Elastica\Index $index, Query $query ) {
+ try {
+ $mwElasticUtilsClass = $this->getMWElasticUtilsClass();
+ $mwElasticUtilsClass::deleteByQuery( $index, $query, /* $allowConflicts = */ true );
+ } catch ( Exception $e ) {
+ LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->error(
+ 'Problem encountered during deletion.',
+ [ 'exception' => $e ]
+ );
+
+ throw new RuntimeException( "Problem encountered during deletion.\n" . $e );
+ }
+ }
+
+ /**
+ * For MW < 1.38 MWElasticUtils was not namespaced in the Elastica extension
+ * Changed in Id29047c67a7d0bedc9a7e7ebd3879f21f82b2742
+ * @return string
+ */
+ private function getMWElasticUtilsClass(): string {
+ if ( class_exists( MWElasticUtils::class ) ) {
+ return MWElasticUtils::class;
+ } else {
+ return '\MWElasticUtils';
+ }
+ }
+
+ /* @throws RuntimeException */
+ private function getElasticsearchVersion(): string {
+ $response = $this->getClient()->request( '' );
+ if ( !$response->isOK() ) {
+ throw new \RuntimeException( "Cannot fetch elasticsearch version: " . $response->getError() );
+ }
+
+ $result = $response->getData();
+ if ( !isset( $result['version']['number'] ) ) {
+ throw new \RuntimeException( 'Unable to determine elasticsearch version, aborting.' );
+ }
+
+ return $result[ 'version' ][ 'number' ];
+ }
+
+ private function checkElasticsearchVersion() {
+ $version = $this->getElasticsearchVersion();
+ if ( strpos( $version, '6.8' ) !== 0 && strpos( $version, '7.' ) !== 0 ) {
+ throw new \RuntimeException( "Only Elasticsearch 6.8.x and 7.x are supported. Your version: $version." );
+ }
+ }
+
+ private function useElastica6(): bool {
+ return class_exists( '\Elastica\Type' );
+ }
+}