Page MenuHomePhabricator
Paste P13907

Customize CirrusSearch for czech
ActivePublic

Authored by dcausse on Jan 22 2021, 11:08 AM.
<?php
// Add the following to your LocalSettings.php
// Elasticsearch plugins required:
// - extra
// - analysis-icu
$wgHooks['CirrusSearchAnalysisConfig'][] = function ( array &$config, $builder ) {
    // Handler for the CirrusSearchAnalysisConfig hook: edits the analysis
    // configuration array in place (it is received by reference).
    // $builder is part of the hook signature and is not used here.

    // Append asciifolding_preserve to the plain analyzer (but not plain_search)
    // and to lowercase_keyword (used for incategory/hastemplate).
    // The filter is only referenced by name; it is expected to be defined
    // elsewhere in $config already.
    foreach ( [ 'plain', 'lowercase_keyword' ] as $analyzerName ) {
        $config['analyzer'][$analyzerName]['filter'][] = 'asciifolding_preserve';
    }

    // "Unpacked" Czech analysis: declare the stop-word and stemmer filters
    // explicitly so they can be combined in a custom analyzer below.
    $config['filter']['czech_stop'] = [ 'type' => 'stop', 'stopwords' => '_czech_' ];
    $config['filter']['czech_stemmer'] = [ 'type' => 'stemmer', 'language' => 'czech' ];

    /*
     * To prevent some words from being stemmed, define a keyword marker
     * and enable the commented-out 'czech_keywords' entry in the chain below:
     *
     * $config['filter']['czech_keywords'] = [
     *     'type' => 'keyword_marker',
     *     'keywords' => ['příklad']
     * ];
     */

    // Token filters run in the listed order.
    $czechFilterChain = [
        'lowercase',
        'czech_stop',
        // 'czech_keywords', // must come before the stemmer if enabled
        'czech_stemmer',
        'asciifolding',
    ];

    // Custom analyzer replacing the "text" analyzer; word_break_helper is
    // referenced by name and expected to be defined elsewhere in $config.
    $czechTextAnalyzer = [
        'type' => 'custom',
        'tokenizer' => 'standard',
        'char_filter' => [ 'word_break_helper' ],
        'filter' => $czechFilterChain,
    ];

    // text_search is an exact copy of text.
    $config['analyzer']['text'] = $czechTextAnalyzer;
    $config['analyzer']['text_search'] = $czechTextAnalyzer;
};
// Site language code: 'cs' (Czech).
// NOTE(review): presumably CirrusSearch derives its language-specific analysis
// defaults from this setting — confirm against the CirrusSearch documentation.
$wgLanguageCode = 'cs';
// ICU folding is enabled here. Set $wgCirrusSearchUseIcuFolding to 'no' if you
// only want to fold ASCII/Latin; ICU folding is more aggressive and affects
// scripts other than Latin.
$wgCirrusSearchUseIcuFolding = 'yes';
// Optional: set of characters ICU folding should ignore. The commented-out
// example below lists å/ä/ö, which appear to be carried over from a
// Swedish/Finnish setup; substitute the characters you want left unfolded
// before enabling it.
// $wgCirrusSearchICUFoldingUnicodeSetFilter = "[^åäöÅÄÖ]";

Event Timeline

I edited one comment, added a comment to the text_search config, and removed a duplicate line adding asciifolding_preserve to plain. Looks good!

Added a comment about ICU folding, too.