Page MenuHomePhabricator
Paste P4875

first attempt at wikidata prefix search
ActivePublic

Authored by EBernhardson on Feb 2 2017, 8:59 PM.
{
"_source": [
"namespace",
"title"
],
"query": {
"bool": {
"filter": {
"multi_match": {
"query": "Obama",
"fields": [
"labels_all.prefix",
"title.keyword"
]
}
},
"should": {
"multi_match": {
"query": "Obama",
"type": "best_fields",
"tie_breaker": 0,
"fields": [
"title.keyword",
"labels.fr.near_match^40",
"labels.fr.near_match_folded^30",
"labels.fr.prefix^15",
"labels.en.near_match^25",
"labels.en.near_match_folded^20",
"labels.en.prefix^10",
"labels_all.prefix^0.5"
]
}
}
}
},
"highlight": {
"pre_tags": [ "<span class='searchmatch'>" ],
"post_tags": [ "</span>" ],
"require_field_match": false,
"fields": {
"title.keyword": {
"type": "experimental",
"fragmenter": "none",
"number_of_fragments": 1
},
"labels.fr": {
"type": "experimental",
"fragmenter": "none",
"number_of_fragments": 1,
"options": { "skip_if_last_matched": true },
"matched_fields": [
"labels.fr.near_match",
"labels.fr.near_match_folded",
"labels.fr.prefix"
]
},
"labels.en": {
"type": "experimental",
"fragmenter": "none",
"number_of_fragments": 1,
"options": { "skip_if_last_matched": true },
"matched_fields": [
"labels.en.near_match",
"labels.en.near_match_folded",
"labels.en.prefix"
]
},
"labels_all.prefix": {
"type": "experimental",
"fragmenter": "none",
"number_of_fragments": 1,
"options": { "skip_if_last_matched": true }
}
}
},
"size": 20
}

Event Timeline

Query as it is generated now:

{
  "query": {
    "bool": {
      "should": [
        {
          "bool": {
            "filter": [
              {
                "match": {
                  "labels_all.prefix": "Obama"
                }
              }
            ],
            "should": [
              {
                "multi_match": {
                  "type": "best_fields",
                  "tie_breaker": 0,
                  "query": "Obama",
                  "fields": [
                    "labels_all.near_match^1",
                    "labels_fr.near_match^40",
                    "labels_fr.near_match_folded^30",
                    "labels_fr.prefix^15",
                    "labels_en.near_match^25",
                    "labels_en.near_match_folded^20",
                    "labels_en.prefix^10"
                  ]
                }
              }
            ]
          }
        },
        {
          "term": {
            "title.keyword": "Obama"
          }
        }
      ],
      "minimum_number_should_match": 1,
      "filter": [
      ]
    }
  },
  "_source": [
    "namespace",
    "title",
    "labels.fr",
    "labels.en"
  ],
  "fields": false,
  "highlight": {
    "pre_tags": [
      ""
    ],
    "post_tags": [
      ""
    ],
    "fields": {
      "title": {
        "type": "experimental",
        "fragmenter": "none",
        "number_of_fragments": 0,
        "matched_fields": [
          "title.keyword"
        ]
      },
      "labels.fr.prefix": {
        "type": "experimental",
        "fragmenter": "none",
        "number_of_fragments": 0,
        "options": {
          "skip_if_last_matched": true
        }
      },
      "labels.en.prefix": {
        "type": "experimental",
        "fragmenter": "none",
        "number_of_fragments": 0,
        "options": {
          "skip_if_last_matched": true
        }
      }
    }
  }
}

Since the filter is on labels_all, this query can return documents with a score of 0. I believe we need a final fallback with a very low boost on labels_all in the query, and in the highlight stage as well. Otherwise a search for "Daleithiau" (Welsh for "states"), which isn't present in the selected languages, will return seemingly nonsensical results with no highlights:

{
  "hits": {
    "hits": [
      {
        "_source": {
          "labels": {
            "fr": [
              "Webtop"
            ],
            "en": [
              "Web desktop"
            ]
          },
          "title": "Q1780763",
          "namespace": 0
        },
        "_score": 0,
        "_id": "1712622",
        "_type": "page",
        "_index": "stas_wikidata_test"
      },
      {
        "_source": {
          "labels": {
            "fr": [
              "Complément orthogonal",
              "Complément Orthogonal",
              "Complement orthogonal"
            ],
            "en": [
              "orthogonal complement"
            ]
          },
          "title": "Q1780921",
          "namespace": 0
        },
        "_score": 0,
        "_id": "1712782",
        "_type": "page",
        "_index": "stas_wikidata_test"
      },
      ...
    ]
  }
}

Alternatively, the bool query's "should" clause could be turned into a "must" clause, which will lower recall for terms not in the fallback language chain.