summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYury German <blueknight@gentoo.org>2022-06-15 12:08:35 -0400
committerYury German <blueknight@gentoo.org>2022-06-15 12:08:35 -0400
commit36d7691c33cb64ece817246e47a779ec648d10b0 (patch)
tree08f2fb95303a1d8eeba2c8629a24b35a91fb1cac /plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php
parenttwentyfourteen upg 2.7 to 3.2 and twentysixteen from 2.0 to 2.5 (diff)
downloadblogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.tar.gz
blogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.tar.bz2
blogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.zip
Openid-3.6.1 and jetpack-11.0 upgrade
Signed-off-by: Yury German <blueknight@gentoo.org>
Diffstat (limited to 'plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php')
-rw-r--r--plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php691
1 files changed, 0 insertions, 691 deletions
diff --git a/plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php b/plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php
deleted file mode 100644
index 42a82ede..00000000
--- a/plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php
+++ /dev/null
@@ -1,691 +0,0 @@
-<?php
-
-/**
- * Parse a pure text query into WordPress Elasticsearch query. This builds on
- * the Jetpack_WPES_Query_Builder() to provide search query parsing.
- *
- * The key part of this parser is taking a user's query string typed into a box
- * and converting it into an ES search query.
- *
- * This varies by application, but roughly it means extracting some parts of the query
- * (authors, tags, and phrases) that are treated as a filter. Then taking the
- * remaining words and building the correct query (possibly with prefix searching
- * if we are doing search as you type)
- *
- * This class only supports ES 2.x+
- *
- * This parser builds queries of the form:
- * bool:
- * must:
- * AND match of a single field (ideally an edgengram field)
- * filter:
- * filter clauses from context (eg @gibrown, #news, etc)
- * should:
- * boosting of results by various fields
- *
- * Features supported:
- * - search as you type
- * - phrases
- * - supports querying across multiple languages at once
- *
- * Example usage (from Search on Reader Manage):
- *
- * require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-search-query-parser' );
- * $parser = new Jetpack_WPES_Search_Query_Parser( $args['q'], array( $lang ) );
- *
- * //author
- * $parser->author_field_filter( array(
- * 'prefixes' => array( '@' ),
- * 'wpcom_id_field' => 'author_id',
- * 'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
- * 'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
- * ) );
- *
- * //remainder of query
- * $match_content_fields = $parser->merge_ml_fields(
- * array(
- * 'all_content' => 0.1,
- * ),
- * array(
- * 'all_content.default.engram^0.1',
- * )
- * );
- * $boost_content_fields = $parser->merge_ml_fields(
- * array(
- * 'title' => 2,
- * 'description' => 1,
- * 'tags' => 1,
- * ),
- * array(
- * 'author_login^2',
- * 'author^2',
- * )
- * );
- *
- * $parser->phrase_filter( array(
- * 'must_query_fields' => $match_content_fields,
- * 'boost_query_fields' => $boost_content_fields,
- * ) );
- * $parser->remaining_query( array(
- * 'must_query_fields' => $match_content_fields,
- * 'boost_query_fields' => $boost_content_fields,
- * ) );
- *
- * //Boost on phrases
- * $parser->remaining_query( array(
- * 'boost_query_fields' => $boost_content_fields,
- * 'boost_query_type' => 'phrase',
- * ) );
- *
- * //boosting
- * $parser->add_max_boost_to_functions( 20 );
- * $parser->add_function( 'field_value_factor', array(
- * 'follower_count' => array(
- * 'modifier' => 'sqrt',
- * 'factor' => 1,
- * 'missing' => 0,
- * ) ) );
- *
- * //Filtering
- * $parser->add_filter( array(
- * 'exists' => array( 'field' => 'langs.' . $lang )
- * ) );
- *
- * //run the query
- * $es_query_args = array(
- * 'name' => 'feeds',
- * 'blog_id' => false,
- * 'security_strategy' => 'a8c',
- * 'type' => 'feed,blog',
- * 'fields' => array( 'blog_id', 'feed_id' ),
- * 'query' => $parser->build_query(),
- * 'filter' => $parser->build_filter(),
- * 'size' => $size,
- * 'from' => $from
- * );
- * $es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
- *
- */
-
-jetpack_require_lib( 'jetpack-wpes-query-builder' );
-
-class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {
-
- protected $orig_query = '';
- protected $current_query = '';
- protected $langs;
- protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );
-
- public function __construct( $user_query, $langs ) {
- $this->orig_query = $user_query;
- $this->current_query = $this->orig_query;
- $this->langs = $this->norm_langs( $langs );
- }
-
- protected $extracted_phrases = array();
-
- public function get_current_query() {
- return $this->current_query;
- }
-
- public function set_current_query( $q ) {
- $this->current_query = $q;
- }
-
- ///////////////////////////////////////////////////////
- // Methods for Building arrays of multilingual fields
-
- /*
- * Normalize language codes
- */
- public function norm_langs( $langs ) {
- $lst = array();
- foreach( $langs as $l ) {
- $l = strtok( $l, '-_' );
- if ( in_array( $l, $this->avail_langs ) ) {
- $lst[$l] = true;
- } else {
- $lst['default'] = true;
- }
- }
- return array_keys( $lst );
- }
-
- /*
- * Take a list of field prefixes and expand them for multi-lingual
- * with the provided boostings.
- */
- public function merge_ml_fields( $fields2boosts, $additional_fields ) {
- $flds = array();
- foreach( $fields2boosts as $f => $b ) {
- foreach( $this->langs as $l ) {
- $flds[] = $f . '.' . $l . '^' . $b;
- }
- }
- foreach( $additional_fields as $f ) {
- $flds[] = $f;
- }
- return $flds;
- }
-
- ////////////////////////////////////
- // Extract Fields for Filtering on
-
- /*
- * Extract any @mentions from the user query
- * use them as a filter if we can find a wp.com id
- * otherwise use them as a
- *
- * args:
- * wpcom_id_field: wp.com id field
- * must_query_fields: array of fields to search for matching results (optional)
- * boost_query_fields: array of fields to search in for boosting results (optional)
- * prefixes: array of prefixes that the user can use to indicate an author
- *
- * returns true/false of whether any were found
- *
- * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
- */
- public function author_field_filter( $args ) {
- $defaults = array(
- 'wpcom_id_field' => 'author_id',
- 'must_query_fields' => null,
- 'boost_query_fields' => null,
- 'prefixes' => array( '@' ),
- );
- $args = wp_parse_args( $args, $defaults );
-
- $names = array();
- foreach( $args['prefixes'] as $p ) {
- $found = $this->get_fields( $p );
- if ( $found ) {
- foreach( $found as $f ) {
- $names[] = $f;
- }
- }
- }
-
- if ( empty( $names ) ) {
- return false;
- }
-
- foreach( $args['prefixes'] as $p ) {
- $this->remove_fields( $p );
- }
-
- $user_ids = array();
- $query_names = array();
-
- //loop through the matches and separate into filters and queries
- foreach( $names as $n ) {
- //check for exact match on login
- $userdata = get_user_by( 'login', strtolower( $n ) );
- $filtering = false;
- if ( $userdata ) {
- $user_ids[ $userdata->ID ] = true;
- $filtering = true;
- }
-
- $is_phrase = false;
- if ( preg_match( '/"/', $n ) ) {
- $is_phrase = true;
- $n = preg_replace( '/"/', '', $n );
- }
-
- if ( !empty( $args['must_query_fields'] ) && !$filtering ) {
- if ( $is_phrase ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $n,
- 'type' => 'phrase',
- ) ) );
- } else {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $n,
- ) ) );
- }
- }
-
- if ( !empty( $args['boost_query_fields'] ) ) {
- if ( $is_phrase ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $n,
- 'type' => 'phrase',
- ) ), 'should' );
- } else {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $n,
- ) ), 'should' );
- }
- }
- }
-
- if ( ! empty( $user_ids ) ) {
- $user_ids = array_keys( $user_ids );
- $this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
- }
-
- return true;
- }
-
- /*
- * Extract any prefix followed by text use them as a must clause,
- * and optionally as a boost to the should query
- * This can be used for hashtags. eg #News, or #"current events",
- * but also works for any arbitrary field. eg from:Greg
- *
- * args:
- * must_query_fields: array of fields that must match the tag (optional)
- * boost_query_fields: array of fields to boost search on (optional)
- * prefixes: array of prefixes that the user can use to indicate a tag
- *
- * returns true/false of whether any were found
- *
- */
- public function text_field_filter( $args ) {
- $defaults = array(
- 'must_query_fields' => array( 'tag.name' ),
- 'boost_query_fields' => array( 'tag.name' ),
- 'prefixes' => array( '#' ),
- );
- $args = wp_parse_args( $args, $defaults );
-
- $tags = array();
- foreach( $args['prefixes'] as $p ) {
- $found = $this->get_fields( $p );
- if ( $found ) {
- foreach( $found as $f ) {
- $tags[] = $f;
- }
- }
- }
-
- if ( empty( $tags ) ) {
- return false;
- }
-
- foreach( $args['prefixes'] as $p ) {
- $this->remove_fields( $p );
- }
-
- foreach( $tags as $t ) {
- $is_phrase = false;
- if ( preg_match( '/"/', $t ) ) {
- $is_phrase = true;
- $t = preg_replace( '/"/', '', $t );
- }
-
- if ( ! empty( $args['must_query_fields'] ) ) {
- if ( $is_phrase ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $t,
- 'type' => 'phrase',
- ) ) );
- } else {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $t,
- ) ) );
- }
- }
-
- if ( ! empty( $args['boost_query_fields'] ) ) {
- if ( $is_phrase ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $t,
- 'type' => 'phrase',
- ) ), 'should' );
- } else {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $t,
- ) ), 'should' );
- }
- }
- }
-
- return true;
- }
-
- /*
- * Extract anything surrounded by quotes or if there is an opening quote
- * that is not complete, and add them to the query as a phrase query.
- * Quotes can be either '' or ""
- *
- * args:
- * must_query_fields: array of fields that must match the phrases
- * boost_query_fields: array of fields to boost the phrases on (optional)
- *
- * returns true/false of whether any were found
- *
- */
- public function phrase_filter( $args ) {
- $defaults = array(
- 'must_query_fields' => array( 'all_content' ),
- 'boost_query_fields' => array( 'title' ),
- );
- $args = wp_parse_args( $args, $defaults );
-
- $phrases = array();
- if ( preg_match_all( '/"([^"]+)"/', $this->current_query, $matches ) ) {
- foreach ( $matches[1] as $match ) {
- $phrases[] = $match;
- }
- $this->current_query = preg_replace( '/"([^"]+)"/', '', $this->current_query );
- }
-
- if ( preg_match_all( "/'([^']+)'/", $this->current_query, $matches ) ) {
- foreach ( $matches[1] as $match ) {
- $phrases[] = $match;
- }
- $this->current_query = preg_replace( "/'([^']+)'/", '', $this->current_query );
- }
-
- //look for a final, uncompleted phrase
- $phrase_prefix = false;
- if ( preg_match_all( '/"([^"]+)$/', $this->current_query, $matches ) ) {
- $phrase_prefix = $matches[1][0];
- $this->current_query = preg_replace( '/"([^"]+)$/', '', $this->current_query );
- }
- if ( preg_match_all( "/(?:'\B|\B')([^']+)$/", $this->current_query, $matches ) ) {
- $phrase_prefix = $matches[1][0];
- $this->current_query = preg_replace( "/(?:'\B|\B')([^']+)$/", '', $this->current_query );
- }
-
- if ( $phrase_prefix ) {
- $phrases[] = $phrase_prefix;
- }
- if ( empty( $phrases ) ) {
- return false;
- }
-
- foreach ( $phrases as $p ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $p,
- 'type' => 'phrase',
- ) ) );
-
- if ( ! empty( $args['boost_query_fields'] ) ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $p,
- 'operator' => 'and',
- ) ), 'should' );
- }
- }
-
- return true;
- }
-
- /*
- * Query fields based on the remaining parts of the query
- * This could be the final AND part of the query terms to match, or it
- * could be boosting certain elements of the query
- *
- * args:
- * must_query_fields: array of fields that must match the remaining terms (optional)
- * boost_query_fields: array of fields to boost the remaining terms on (optional)
- *
- */
- public function remaining_query( $args ) {
- $defaults = array(
- 'must_query_fields' => null,
- 'boost_query_fields' => null,
- 'boost_operator' => 'and',
- 'boost_query_type' => 'best_fields',
- );
- $args = wp_parse_args( $args, $defaults );
-
- if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
- return;
- }
-
- if ( ! empty( $args['must_query_fields'] ) ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $this->current_query,
- 'operator' => 'and',
- ) ) );
- }
-
- if ( ! empty( $args['boost_query_fields'] ) ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $this->current_query,
- 'operator' => $args['boost_operator'],
- 'type' => $args['boost_query_type'],
- ) ), 'should' );
- }
-
- }
-
- /*
- * Query fields using a prefix query (alphabetical expansions on the index).
- * This is not recommended. Slower performance and worse relevancy.
- *
- * (UNTESTED! Copied from old prefix expansion code)
- *
- * args:
- * must_query_fields: array of fields that must match the remaining terms (optional)
- * boost_query_fields: array of fields to boost the remaining terms on (optional)
- *
- */
- public function remaining_prefix_query( $args ) {
- $defaults = array(
- 'must_query_fields' => array( 'all_content' ),
- 'boost_query_fields' => array( 'title' ),
- 'boost_operator' => 'and',
- 'boost_query_type' => 'best_fields',
- );
- $args = wp_parse_args( $args, $defaults );
-
- if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
- return;
- }
-
- //////////////////////////////////
- // Example cases to think about:
- // "elasticse"
- // "elasticsearch"
- // "elasticsearch "
- // "elasticsearch lucen"
- // "elasticsearch lucene"
- // "the future" - note the stopword which will match nothing!
- // "F1" - an exact match that also has tons of expansions
- // "こんにちは" ja "hello"
- // "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
- // - this could still be better I bet. Maybe we need to analyze with ES first?
- //
-
- /////////////////////////////
- //extract pieces of query
- // eg: "PREFIXREMAINDER PREFIXWORD"
- // "elasticsearch lucen"
-
- $prefix_word = false;
- $prefix_remainder = false;
- if ( preg_match_all( '/([^ ]+)$/', $this->current_query, $matches ) ) {
- $prefix_word = $matches[1][0];
- }
-
- $prefix_remainder = preg_replace( '/([^ ]+)$/', '', $this->current_query );
- if ( ctype_space( $prefix_remainder ) ) {
- $prefix_remainder = false;
- }
-
- if ( ! $prefix_word ) {
- //Space at the end of the query, so skip using a prefix query
- if ( ! empty( $args['must_query_fields'] ) ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['must_query_fields'],
- 'query' => $this->current_query,
- 'operator' => 'and',
- ) ) );
- }
-
- if ( ! empty( $args['boost_query_fields'] ) ) {
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $this->current_query,
- 'operator' => $args['boost_operator'],
- 'type' => $args['boost_query_type'],
- ) ), 'should' );
- }
- } else {
-
- //must match the prefix word and the prefix remainder
- if ( ! empty( $args['must_query_fields'] ) ) {
- //need to do an OR across a few fields to handle all cases
- $must_q = array( 'bool' => array( 'should' => array( ), 'minimum_should_match' => 1 ) );
-
- //treat all words as an exact search (boosts complete word like "news"
- //from prefixes of "newspaper")
- $must_q['bool']['should'][] = array( 'multi_match' => array(
- 'fields' => $this->all_fields,
- 'query' => $full_text,
- 'operator' => 'and',
- 'type' => 'cross_fields',
- ) );
-
- //always optimistically try and match the full text as a phrase
- //prefix "the futu" should try to match "the future"
- //otherwise the first stopword kinda breaks
- //This also works as the prefix match for a single word "elasticsea"
- $must_q['bool']['should'][] = array( 'multi_match' => array(
- 'fields' => $this->phrase_fields,
- 'query' => $full_text,
- 'operator' => 'and',
- 'type' => 'phrase_prefix',
- 'max_expansions' => 100,
- ) );
-
- if ( $prefix_remainder ) {
- //Multiple words found, so treat each word on its own and not just as
- //a part of a phrase
- //"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
- $q['bool']['should'][] = array( 'bool' => array(
- 'must' => array(
- array( 'multi_match' => array(
- 'fields' => $this->phrase_fields,
- 'query' => $prefix_word,
- 'operator' => 'and',
- 'type' => 'phrase_prefix',
- 'max_expansions' => 100,
- ) ),
- array( 'multi_match' => array(
- 'fields' => $this->all_fields,
- 'query' => $prefix_remainder,
- 'operator' => 'and',
- 'type' => 'cross_fields',
- ) ),
- )
- ) );
- }
-
- $this->add_query( $must_q );
- }
-
- //Now add any boosting of the query
- if ( ! empty( $args['boost_query_fields'] ) ) {
- //treat all words as an exact search (boosts complete word like "news"
- //from prefixes of "newspaper")
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $this->current_query,
- 'operator' => $args['boost_query_operator'],
- 'type' => $args['boost_query_type'],
- ) ), 'should' );
-
- //optimistically boost the full phrase prefix match
- $this->add_query( array(
- 'multi_match' => array(
- 'fields' => $args['boost_query_fields'],
- 'query' => $this->current_query,
- 'operator' => 'and',
- 'type' => 'phrase_prefix',
- 'max_expansions' => 100,
- ) ) );
- }
- }
- }
-
- /*
- * Boost results based on the lang probability overlaps
- *
- * args:
- * langs2prob: list of languages to search in with associated boosts
- */
- public function boost_lang_probs( $langs2prob ) {
- foreach( $langs2prob as $l => $p ) {
- $this->add_function( 'field_value_factor', array(
- 'modifier' => 'none',
- 'factor' => $p,
- 'missing' => 0.01, //1% chance doc did not have right lang detected
- ) );
- }
- }
-
- ////////////////////////////////////
- // Helper Methods
-
- //Get the text after some prefix. eg @gibrown, or @"Greg Brown"
- protected function get_fields( $field_prefix ) {
- $regex = '/' . $field_prefix . '(("[^"]+")|([^\\p{Z}]+))/';
- if ( preg_match_all( $regex, $this->current_query, $match ) ) {
- return $match[1];
- }
- return false;
- }
-
- //Remove the prefix and text from the query
- protected function remove_fields( $field_name ) {
- $regex = '/' . $field_name . '(("[^"]+")|([^\\p{Z}]+))/';
- $this->current_query = preg_replace( $regex, '', $this->current_query );
- }
-
- //Best effort string truncation that splits on word breaks
- protected function truncate_string( $string, $limit, $break=" " ) {
- if ( mb_strwidth( $string ) <= $limit ) {
- return $string;
- }
-
- // walk backwards from $limit to find first break
- $breakpoint = $limit;
- $broken = false;
- while ( $breakpoint > 0 ) {
- if ( $break === mb_strimwidth( $string, $breakpoint, 1 ) ) {
- $string = mb_strimwidth( $string, 0, $breakpoint );
- $broken = true;
- break;
- }
- $breakpoint--;
- }
- // if we weren't able to find a break, need to chop mid-word
- if ( !$broken ) {
- $string = mb_strimwidth( $string, 0, $limit );
- }
- return $string;
- }
-
-}