Support Questions

Find answers, ask questions, and share your expertise

ElasticSearch query to perform case-insensitive search with aggregations to get matching column counts

avatar
Contributor

We are using ElasticSearch 5.0.0.

Please let us know if there is any regex or any other way to perform case insensitive search.

13246-elasticsearch-query-movies-sample-data.jpg

Please find data in movies index in ElasticSearch in attachment.

Please find aggregation query to find fields matching search string “*drama*” in movies index:

GET /movies/_search?pretty

{

"size": 0,

"_source": false,

"query": {

"query_string": {

"analyze_wildcard": true,

"query": "*drama*"

}

},

"aggs": {

"distinct_tables_1": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_1": {

"terms": {

"field": "genres.keyword",

"include" : ".*drama.*"

}

}

}

},

"distinct_tables_2": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_2": {

"terms": {

"field": "director.keyword",

"include" : ".*drama.*"

}

}

}

},

"distinct_tables_3": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_3": {

"terms": {

"field": "theatre.keyword",

"include" : ".*drama.*"

}

}

}

}

}

}

We get the following response:

{

"took": 10,

"timed_out": false,

"_shards": {

"total": 5,

"successful": 5,

"failed": 0

},

"hits": {

"total": 4,

"max_score": 0,

"hits": []

},

"aggregations": {

"distinct_tables_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

}

]

},

"distinct_tables_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

}

]

},

"distinct_tables_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

}

]

}

}

}

It can be seen from the response that there are no matching columns values in response even though there are documents matching search string “drama”. The search for regex in aggregations appears to be case sensitive and so no values are returned.

We used this alternate query to find words matching Drama to perform case-insensitive search. However this uses only part word .*rama.* instead of Drama and it would be better to perform case-insensitive search.

GET /movies/_search?pretty

{

"size": 0,

"_source": false,

"query": {

"query_string": {

"analyze_wildcard": true,

"query": "*drama*"

}

},

"aggs": {

"distinct_tables_1": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_1": {

"terms": {

"field": "genres.keyword",

"include" : ".*rama.*"

}

}

}

},

"distinct_tables_2": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_2": {

"terms": {

"field": "director.keyword",

"include" : ".*rama.*"

}

}

}

},

"distinct_tables_3": {

"terms": {

"field": "_type"

},

"aggs": {

"distinct_col_3": {

"terms": {

"field": "theatre.keyword",

"include" : ".*rama.*"

}

}

}

}

}

}

Response for the query given above:

{

"took": 1,

"timed_out": false,

"_shards": {

"total": 5,

"successful": 5,

"failed": 0

},

"hits": {

"total": 4,

"max_score": 0,

"hits": []

},

"aggregations": {

"distinct_tables_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "BiographyDrama",

"doc_count": 1

},

{

"key": "Drama",

"doc_count": 1

}

]

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_1": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "BiographyDrama",

"doc_count": 1

},

{

"key": "Drama",

"doc_count": 1

}

]

}

}

]

},

"distinct_tables_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "Drama1",

"doc_count": 1

},

{

"key": "Drama4",

"doc_count": 1

}

]

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_2": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

}

]

},

"distinct_tables_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "movie_intrnl",

"doc_count": 2,

"distinct_col_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

},

{

"key": "movie_shows",

"doc_count": 2,

"distinct_col_3": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": [

{

"key": "Drama4",

"doc_count": 1

}

]

}

}

]

}

}

}

5 REPLIES 5

avatar
Super Guru
@Yogesh Sharma

Can you share your Elasticsearch index template that defines your field mappings? What type of analyzers and tokenizers are you using? You can define the mappings on a per-index basis:

https://www.elastic.co/guide/en/elasticsearch/reference/5.2/analysis.html

Are you using the standard analyzer?

https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html

avatar
Contributor

@Michael Young

We are using the default analyzer and tokenizer. The _settings endpoint for index does not provide the analyzer that is being used.

We are using default mappings for fields and we have not added any new templates.

Please find the mappings used for the index movies below:

{

"movies": {

"mappings": {

"movie_shows": {

"properties": {

"date": {

"type": "date"

},

"genres": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

},

"id": {

"type": "long"

},

"theatre": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

},

"title": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

}

}

},

"movie_intrnl": {

"properties": {

"director": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

},

"genres": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

},

"id": {

"type": "long"

},

"title": {

"type": "text",

"fields": {

"keyword": {

"type": "keyword",

"ignore_above": 256

}

}

},

"year": {

"type": "long"

}

}

}

}

}

}

avatar
Super Guru

@Yogesh Sharma

Have you disabled the _all field? That is the catch-all field that is used for a query when you don't specify a field. Your queries are not specifying a specific field, so it should be going against the _all field. By default the _all field should be able to handle mixed-case queries.

Have you verified the query returns results without any of the aggregations:

GET /movies/_search?pretty
{
  "size": 10,
  "_source": false,
  "query": {
    "query_string": {
      "analyze_wildcard": true,
      "query": "*drama*"
     }
  }
}

avatar
Contributor

@Michael Young

The _all field is not disabled and we are getting the following response for the query.

Query:

GET /movies/_search?pretty

{

"size": 10,

"_source": false,

"query": {

"query_string": {

"analyze_wildcard": true,

"query": "*drama*"

}

}

}

Query Response:

{

"took": 1,

"timed_out": false,

"_shards": {

"total": 5,

"successful": 5,

"failed": 0

},

"hits": {

"total": 4,

"max_score": 1,

"hits": [

{

"_index": "movies",

"_type": "movie_intrnl",

"_id": "AVoYRhQexAEXKBamIeYy",

"_score": 1

},

{

"_index": "movies",

"_type": "movie_shows",

"_id": "AVoYRuxxxAEXKBamIeY2",

"_score": 1

},

{

"_index": "movies",

"_type": "movie_shows",

"_id": "AVoYRuxxxAEXKBamIeY4",

"_score": 1

},

{

"_index": "movies",

"_type": "movie_intrnl",

"_id": "AVoYRhQexAEXKBamIeYw",

"_score": 1

}

]

}

}

The high level intent is to identify fields and values from index matching search - for presence of keyword anywhere in the document and so the _all field is used.

avatar
Super Guru
@Yogesh Sharma

The _all field is analyzed by default, so you shouldn't have problems performing case-insensitive queries. You are also specifying the analyze_wildcard: true parameter which will attempt to analyze the query string with wildcards before running the query. As you have shown, the query itself returns hits. So the problem is with the aggregations.

For your aggregations you are using the include parameter. Can you try using ".*drama.*" as the include value instead of "*drama*"?