diff options
author | Yury German <blueknight@gentoo.org> | 2022-06-15 12:08:35 -0400 |
---|---|---|
committer | Yury German <blueknight@gentoo.org> | 2022-06-15 12:08:35 -0400 |
commit | 36d7691c33cb64ece817246e47a779ec648d10b0 (patch) | |
tree | 08f2fb95303a1d8eeba2c8629a24b35a91fb1cac /plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php | |
parent | twentyfourteen upg 2.7 to 3.2 and twentysixteen from 2.0 to 2.5 (diff) | |
download | blogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.tar.gz blogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.tar.bz2 blogs-gentoo-36d7691c33cb64ece817246e47a779ec648d10b0.zip |
Openid-3.6.1 and jetpack-11.0 upgrade
Signed-off-by: Yury German <blueknight@gentoo.org>
Diffstat (limited to 'plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php')
-rw-r--r-- | plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php | 691 |
1 files changed, 0 insertions, 691 deletions
<?php

/**
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
 * the Jetpack_WPES_Query_Builder() to provide search query parsing.
 *
 * The key part of this parser is taking a user's query string typed into a box
 * and converting it into an ES search query.
 *
 * This varies by application, but roughly it means extracting some parts of the query
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
 * remaining words and building the correct query (possibly with prefix searching
 * if we are doing search as you type)
 *
 * This class only supports ES 2.x+
 *
 * This parser builds queries of the form:
 *   bool:
 *     must:
 *       AND match of a single field (ideally an edgengram field)
 *     filter:
 *       filter clauses from context (eg @gibrown, #news, etc)
 *     should:
 *       boosting of results by various fields
 *
 * Features supported:
 *  - search as you type
 *  - phrases
 *  - supports querying across multiple languages at once
 *
 * Example usage (from Search on Reader Manage):
 *
 *    require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-search-query-parser' );
 *    $parser = new Jetpack_WPES_Search_Query_Parser( $args['q'], array( $lang ) );
 *
 *    //author
 *    $parser->author_field_filter( array(
 *      'prefixes' => array( '@' ),
 *      'wpcom_id_field' => 'author_id',
 *      'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
 *      'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
 *    ) );
 *
 *    //remainder of query
 *    $match_content_fields = $parser->merge_ml_fields(
 *      array(
 *        'all_content' => 0.1,
 *      ),
 *      array(
 *        'all_content.default.engram^0.1',
 *      )
 *    );
 *    $boost_content_fields = $parser->merge_ml_fields(
 *      array(
 *        'title' => 2,
 *        'description' => 1,
 *        'tags' => 1,
 *      ),
 *      array(
 *        'author_login^2',
 *        'author^2',
 *      )
 *    );
 *
 *    $parser->phrase_filter( array(
 *      'must_query_fields' => $match_content_fields,
 *      'boost_query_fields' => $boost_content_fields,
 *    ) );
 *    $parser->remaining_query( array(
 *      'must_query_fields' => $match_content_fields,
 *      'boost_query_fields' => $boost_content_fields,
 *    ) );
 *
 *    //Boost on phrases
 *    $parser->remaining_query( array(
 *      'boost_query_fields' => $boost_content_fields,
 *      'boost_query_type' => 'phrase',
 *    ) );
 *
 *    //boosting
 *    $parser->add_max_boost_to_functions( 20 );
 *    $parser->add_function( 'field_value_factor', array(
 *      'follower_count' => array(
 *        'modifier' => 'sqrt',
 *        'factor' => 1,
 *        'missing' => 0,
 *      ) ) );
 *
 *    //Filtering
 *    $parser->add_filter( array(
 *      'exists' => array( 'field' => 'langs.' . $lang )
 *    ) );
 *
 *    //run the query
 *    $es_query_args = array(
 *      'name' => 'feeds',
 *      'blog_id' => false,
 *      'security_strategy' => 'a8c',
 *      'type' => 'feed,blog',
 *      'fields' => array( 'blog_id', 'feed_id' ),
 *      'query' => $parser->build_query(),
 *      'filter' => $parser->build_filter(),
 *      'size' => $size,
 *      'from' => $from
 *    );
 *    $es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
 *
 */

jetpack_require_lib( 'jetpack-wpes-query-builder' );

class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {

	/**
	 * The query string exactly as it was handed to the constructor.
	 */
	protected $orig_query = '';

	/**
	 * The part of the query that has not been consumed yet. The *_filter()
	 * methods cut the pieces they recognise out of this string.
	 */
	protected $current_query = '';

	/**
	 * Normalized language codes (see norm_langs()) used by merge_ml_fields().
	 */
	protected $langs;

	/**
	 * Language codes that are passed through by norm_langs(); every other code
	 * is mapped to 'default'.
	 */
	protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );

	/**
	 * Not read or written by any method of this class; kept for subclasses.
	 */
	protected $extracted_phrases = array();

	/**
	 * @param string $user_query The raw text the user typed.
	 * @param array  $langs      Language codes to search in (eg 'en', 'pt-br').
	 */
	public function __construct( $user_query, $langs ) {
		$this->orig_query    = $user_query;
		$this->current_query = $user_query;
		$this->langs         = $this->norm_langs( $langs );
	}

	/**
	 * @return string The part of the query that has not been consumed yet.
	 */
	public function get_current_query() {
		return $this->current_query;
	}

	/**
	 * Replace the not-yet-consumed part of the query.
	 *
	 * @param string $q New remaining query text.
	 */
	public function set_current_query( $q ) {
		$this->current_query = $q;
	}

	///////////////////////////////////////////////////////
	// Methods for Building arrays of multilingual fields

	/**
	 * Normalize language codes.
	 *
	 * Only the part before the first '-' or '_' is kept ('pt-br' => 'pt').
	 * Codes that are not in $avail_langs collapse into a single 'default' entry.
	 *
	 * @param array|string $langs Language code(s); a single code may be given as a string.
	 *
	 * @return array De-duplicated list of normalized codes, in first-seen order.
	 */
	public function norm_langs( $langs ) {
		$seen = array();
		foreach ( (array) $langs as $code ) {
			$base = strtok( $code, '-_' );
			// Strict comparison: strtok() returns false for an empty code and
			// that must never loosely match one of the language strings.
			$key          = in_array( $base, $this->avail_langs, true ) ? $base : 'default';
			$seen[ $key ] = true;
		}
		return array_keys( $seen );
	}

	/**
	 * Take a list of field prefixes and expand them for multi-lingual
	 * with the provided boostings.
	 *
	 * Every key of $fields2boosts yields one "<field>.<lang>^<boost>" entry per
	 * configured language; $additional_fields are appended unchanged.
	 *
	 * @param array $fields2boosts     Map of field prefix => boost.
	 * @param array $additional_fields Ready-made field strings to append.
	 *
	 * @return array List of field strings.
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$fields = array();
		foreach ( $fields2boosts as $field => $boost ) {
			foreach ( $this->langs as $lang ) {
				$fields[] = $field . '.' . $lang . '^' . $boost;
			}
		}
		foreach ( $additional_fields as $field ) {
			$fields[] = $field;
		}
		return $fields;
	}

	////////////////////////////////////
	// Extract Fields for Filtering on

	/**
	 * Extract any @mentions from the user query.
	 * Use them as a filter if we can find a wp.com id (exact login match),
	 * otherwise use them as a text query against must_query_fields.
	 * Every mention is additionally used to boost on boost_query_fields.
	 *
	 * args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 *
	 * @param array $args See above.
	 *
	 * @return bool Whether any mentions were found.
	 */
	public function author_field_filter( $args ) {
		$defaults = array(
			'wpcom_id_field'     => 'author_id',
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'prefixes'           => array( '@' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		$names = $this->extract_prefixed_values( $args['prefixes'] );
		if ( empty( $names ) ) {
			return false;
		}

		// Keyed by user id so repeated mentions of one user are de-duplicated.
		$user_ids = array();

		// Loop through the matches and separate into filters and queries.
		foreach ( $names as $name ) {
			// Check for exact match on login (done on the raw value, quotes included).
			$userdata  = get_user_by( 'login', strtolower( $name ) );
			$filtering = (bool) $userdata;
			if ( $filtering ) {
				$user_ids[ $userdata->ID ] = true;
			}

			list( $text, $is_phrase ) = $this->strip_phrase_quotes( $name );

			// A mention that resolved to a user is enforced by the filter
			// below, so it only needs a must clause when it did not resolve.
			if ( ! empty( $args['must_query_fields'] ) && ! $filtering ) {
				$this->add_multi_match( $args['must_query_fields'], $text, $is_phrase, false );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_multi_match( $args['boost_query_fields'], $text, $is_phrase, true );
			}
		}

		if ( ! empty( $user_ids ) ) {
			$this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => array_keys( $user_ids ) ) ) );
		}

		return true;
	}

	/**
	 * Extract any prefix followed by text use them as a must clause,
	 * and optionally as a boost to the should query.
	 * This can be used for hashtags. eg #News, or #"current events",
	 * but also works for any arbitrary field. eg from:Greg
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 * @param array $args See above.
	 *
	 * @return bool Whether any prefixed values were found.
	 */
	public function text_field_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes'           => array( '#' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		$tags = $this->extract_prefixed_values( $args['prefixes'] );
		if ( empty( $tags ) ) {
			return false;
		}

		foreach ( $tags as $tag ) {
			list( $text, $is_phrase ) = $this->strip_phrase_quotes( $tag );

			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_multi_match( $args['must_query_fields'], $text, $is_phrase, false );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_multi_match( $args['boost_query_fields'], $text, $is_phrase, true );
			}
		}

		return true;
	}

	/**
	 * Extract anything surrounded by quotes or if there is an opening quote
	 * that is not complete, and add them to the query as a phrase query.
	 * Quotes can be either '' or ""
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 * @param array $args See above.
	 *
	 * @return bool Whether any phrases were found.
	 */
	public function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args     = wp_parse_args( $args, $defaults );

		// Order matters: complete phrases are cut out first so that the
		// "$"-anchored patterns only ever see a genuinely unterminated quote.
		$patterns = array(
			'/"([^"]+)"/',
			"/'([^']+)'/",
			// A final, uncompleted phrase.
			'/"([^"]+)$/',
			// The \B alternatives keep an apostrophe inside a word ("don't")
			// from being treated as an opening quote.
			"/(?:'\B|\B')([^']+)$/",
		);

		// Every extracted phrase is kept. Previously an unterminated "..."
		// phrase was removed from the query and then discarded whenever an
		// unterminated '...' phrase was found as well.
		$phrases = array();
		foreach ( $patterns as $pattern ) {
			foreach ( $this->extract_matches( $pattern ) as $phrase ) {
				$phrases[] = $phrase;
			}
		}

		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $phrase ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query'  => $phrase,
						'type'   => 'phrase',
					),
				)
			);

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query(
					array(
						'multi_match' => array(
							'fields'   => $args['boost_query_fields'],
							'query'    => $phrase,
							'operator' => 'and',
						),
					),
					'should'
				);
			}
		}

		return true;
	}

	/**
	 * Query fields based on the remaining parts of the query.
	 * This could be the final AND part of the query terms to match, or it
	 * could be boosting certain elements of the query.
	 *
	 * Does nothing when the remaining query is empty or only whitespace.
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost clause (default 'and')
	 *    boost_query_type: multi_match type for the boost clause (default 'best_fields')
	 *
	 * @param array $args See above.
	 */
	public function remaining_query( $args ) {
		$defaults = array(
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args     = wp_parse_args( $args, $defaults );

		if ( ! $this->has_remaining_text() ) {
			return;
		}

		$this->add_remaining_terms( $args );
	}

	/**
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 * This is not recommended. Slower performance and worse relevancy.
	 *
	 * (UNTESTED! Copied from old prefix expansion code)
	 *
	 * Does nothing when the remaining query is empty or only whitespace.
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost clause (default 'and')
	 *    boost_query_type: multi_match type for the boost clause (default 'best_fields')
	 *
	 * @param array $args See above.
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args     = wp_parse_args( $args, $defaults );

		if ( ! $this->has_remaining_text() ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future" - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		// - this could still be better I bet. Maybe we need to analyze with ES first?

		/////////////////////////////
		// Extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		// "elasticsearch lucen"

		// The prefix word is the last run of non-space characters. Explicit
		// false/'' checks are used so that a word such as "0" is not mistaken
		// for "no word".
		$prefix_word = false;
		if ( preg_match( '/([^ ]+)$/', $this->current_query, $matches ) ) {
			$prefix_word = $matches[1];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $this->current_query );
		if ( ! is_string( $prefix_remainder ) || '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		if ( false === $prefix_word ) {
			// Space at the end of the query, so skip using a prefix query.
			$this->add_remaining_terms( $args );
			return;
		}

		// Must match the prefix word and the prefix remainder.
		if ( ! empty( $args['must_query_fields'] ) ) {
			$must_fields = $args['must_query_fields'];
			$full_text   = $this->current_query;

			// Need to do an OR across a few clauses to handle all cases.
			$must_q = array(
				'bool' => array(
					'should'               => array(),
					'minimum_should_match' => 1,
				),
			);

			// Treat all words as an exact search (boosts complete word like "news"
			// from prefixes of "newspaper").
			$must_q['bool']['should'][] = array(
				'multi_match' => array(
					'fields'   => $must_fields,
					'query'    => $full_text,
					'operator' => 'and',
					'type'     => 'cross_fields',
				),
			);

			// Always optimistically try and match the full text as a phrase
			// prefix: "the futu" should try to match "the future",
			// otherwise the first stopword kinda breaks.
			// This also works as the prefix match for a single word "elasticsea".
			$must_q['bool']['should'][] = array(
				'multi_match' => array(
					'fields'         => $must_fields,
					'query'          => $full_text,
					'operator'       => 'and',
					'type'           => 'phrase_prefix',
					'max_expansions' => 100,
				),
			);

			if ( false !== $prefix_remainder ) {
				// Multiple words found, so treat each word on its own and not just as
				// a part of a phrase.
				// "elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
				$must_q['bool']['should'][] = array(
					'bool' => array(
						'must' => array(
							array(
								'multi_match' => array(
									'fields'         => $must_fields,
									'query'          => $prefix_word,
									'operator'       => 'and',
									'type'           => 'phrase_prefix',
									'max_expansions' => 100,
								),
							),
							array(
								'multi_match' => array(
									'fields'   => $must_fields,
									'query'    => $prefix_remainder,
									'operator' => 'and',
									'type'     => 'cross_fields',
								),
							),
						),
					),
				);
			}

			$this->add_query( $must_q );
		}

		// Now add any boosting of the query.
		if ( ! empty( $args['boost_query_fields'] ) ) {
			// Treat all words as an exact search (boosts complete word like "news"
			// from prefixes of "newspaper").
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'   => $args['boost_query_fields'],
						'query'    => $this->current_query,
						'operator' => $args['boost_operator'],
						'type'     => $args['boost_query_type'],
					),
				),
				'should'
			);

			// Optimistically boost the full phrase prefix match. This is a
			// boost, so it goes into the should clause like the one above.
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'         => $args['boost_query_fields'],
						'query'          => $this->current_query,
						'operator'       => 'and',
						'type'           => 'phrase_prefix',
						'max_expansions' => 100,
					),
				),
				'should'
			);
		}
	}

	/**
	 * Boost results based on the lang probability overlaps.
	 *
	 * NOTE(review): the language key $l is never used, so every iteration
	 * registers a function that differs only in 'factor'. The usage example in
	 * the file header passes add_function() an array keyed by field name, which
	 * suggests a per-language field was intended here - confirm against
	 * Jetpack_WPES_Query_Builder::add_function() before changing.
	 *
	 * @param array $langs2prob Map of language => boost factor.
	 */
	public function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $l => $p ) {
			$this->add_function(
				'field_value_factor',
				array(
					'modifier' => 'none',
					'factor'   => $p,
					'missing'  => 0.01, // 1% chance doc did not have right lang detected
				)
			);
		}
	}

	////////////////////////////////////
	// Helper Methods

	/**
	 * Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	 *
	 * @param string $field_prefix Literal prefix to look for.
	 *
	 * @return array|false The values found (double quotes included for quoted
	 *                     values), or false when there are none.
	 */
	protected function get_fields( $field_prefix ) {
		$found = preg_match_all( $this->field_regex( $field_prefix, true ), $this->current_query, $match );
		if ( false === $found ) {
			// The "u" pattern refuses malformed UTF-8; retry byte-wise so such
			// input is still handled on a best-effort basis.
			$found = preg_match_all( $this->field_regex( $field_prefix, false ), $this->current_query, $match );
		}

		if ( $found ) {
			return $match[1];
		}
		return false;
	}

	/**
	 * Remove the prefix and text from the query.
	 *
	 * @param string $field_name Literal prefix whose occurrences are removed.
	 */
	protected function remove_fields( $field_name ) {
		$stripped = preg_replace( $this->field_regex( $field_name, true ), '', $this->current_query );
		if ( null === $stripped ) {
			// Same malformed UTF-8 fallback as in get_fields().
			$stripped = preg_replace( $this->field_regex( $field_name, false ), '', $this->current_query );
		}

		// preg_replace() returns null on error; never replace the query with that.
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}

	/**
	 * Best effort string truncation that splits on word breaks.
	 *
	 * NOTE: mb_strimwidth() takes its start as a character offset but its
	 * length as a display width, so for text containing double-width
	 * characters the cut position is only approximate.
	 *
	 * @param string $string Text to shorten.
	 * @param int    $limit  Maximum display width as measured by mb_strwidth().
	 * @param string $break  Character to break on.
	 *
	 * @return string The input if it already fits, otherwise a shortened copy.
	 */
	protected function truncate_string( $string, $limit, $break = " " ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		// Walk backwards from $limit to find the first break.
		for ( $breakpoint = $limit; $breakpoint > 0; $breakpoint-- ) {
			if ( $break === mb_strimwidth( $string, $breakpoint, 1 ) ) {
				return mb_strimwidth( $string, 0, $breakpoint );
			}
		}

		// If we weren't able to find a break, need to chop mid-word.
		return mb_strimwidth( $string, 0, $limit );
	}

	/**
	 * Build the pattern matching "<prefix>value" or "<prefix>"quoted value"".
	 *
	 * The prefix is escaped so that it is always matched literally. In UTF-8
	 * mode \p{Z} is evaluated per character; byte-wise it can fire on a byte in
	 * the middle of a multibyte character and cut the value short.
	 *
	 * @param string $prefix Literal prefix.
	 * @param bool   $utf8   Whether to add the "u" modifier.
	 *
	 * @return string PCRE pattern.
	 */
	private function field_regex( $prefix, $utf8 ) {
		return '/' . preg_quote( $prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/' . ( $utf8 ? 'u' : '' );
	}

	/**
	 * Collect the values that follow any of the prefixes and, if there are any,
	 * remove all of them from the current query.
	 *
	 * All prefixes are searched before anything is removed, so one prefix's
	 * removal cannot hide matches of another.
	 *
	 * @param array $prefixes Literal prefixes.
	 *
	 * @return array Values found (possibly empty).
	 */
	private function extract_prefixed_values( $prefixes ) {
		$values = array();
		foreach ( $prefixes as $prefix ) {
			$found = $this->get_fields( $prefix );
			if ( $found ) {
				foreach ( $found as $value ) {
					$values[] = $value;
				}
			}
		}

		if ( empty( $values ) ) {
			return $values;
		}

		foreach ( $prefixes as $prefix ) {
			$this->remove_fields( $prefix );
		}

		return $values;
	}

	/**
	 * Cut every match of $pattern out of the current query.
	 *
	 * @param string $pattern PCRE pattern with the wanted text in group 1.
	 *
	 * @return array The group 1 captures (possibly empty).
	 */
	private function extract_matches( $pattern ) {
		if ( ! preg_match_all( $pattern, $this->current_query, $matches ) ) {
			return array();
		}

		$this->current_query = preg_replace( $pattern, '', $this->current_query );

		return $matches[1];
	}

	/**
	 * Detect a quoted value and drop its double quotes.
	 *
	 * @param string $value Value as returned by get_fields().
	 *
	 * @return array array( text without double quotes, whether it was quoted ).
	 */
	private function strip_phrase_quotes( $value ) {
		if ( false === strpos( $value, '"' ) ) {
			return array( $value, false );
		}
		return array( str_replace( '"', '', $value ), true );
	}

	/**
	 * Add one multi_match clause to the query.
	 *
	 * @param array  $fields    Fields to match on.
	 * @param string $text      Text to match.
	 * @param bool   $is_phrase Whether to match as a phrase.
	 * @param bool   $is_boost  True to add as a 'should' clause, false to add
	 *                          with add_query()'s default clause type.
	 */
	private function add_multi_match( $fields, $text, $is_phrase, $is_boost ) {
		$multi_match = array(
			'fields' => $fields,
			'query'  => $text,
		);
		if ( $is_phrase ) {
			$multi_match['type'] = 'phrase';
		}

		$clause = array( 'multi_match' => $multi_match );
		if ( $is_boost ) {
			$this->add_query( $clause, 'should' );
		} else {
			$this->add_query( $clause );
		}
	}

	/**
	 * Whether the remaining query contains anything besides whitespace.
	 *
	 * A plain empty() check is not used because it would also reject "0".
	 *
	 * @return bool
	 */
	private function has_remaining_text() {
		$text = (string) $this->current_query;
		return '' !== $text && ! ctype_space( $text );
	}

	/**
	 * Add the whole remaining query as an AND must clause and/or a boost
	 * clause, depending on which field lists are set in $args.
	 *
	 * @param array $args Fully defaulted args of remaining_query() /
	 *                    remaining_prefix_query().
	 */
	private function add_remaining_terms( $args ) {
		if ( ! empty( $args['must_query_fields'] ) ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'   => $args['must_query_fields'],
						'query'    => $this->current_query,
						'operator' => 'and',
					),
				)
			);
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$this->add_query(
				array(
					'multi_match' => array(
						'fields'   => $args['boost_query_fields'],
						'query'    => $this->current_query,
						'operator' => $args['boost_operator'],
						'type'     => $args['boost_query_type'],
					),
				),
				'should'
			);
		}
	}

}