<ling:MOTConfig xmlns:ling="exa:com.exalead.linguistic.v10">
  <!--
    MOT linguistic configuration sample: one standard tokenizer with
    overrides, a normalizer with per-word entries, and one lemmatizer per
    language. The "ling" prefix is bound on the root element so that every
    ling:* element below resolves to the exa:com.exalead.linguistic.v10
    namespace.
  -->

  <!-- Tokenizers -->
  <ling:StandardTokenizer>
    <!-- Single-character overrides: the colon is declared with type="token". -->
    <ling:charOverrides>
      <ling:StandardTokenizerOverride type="token" toOverride=":" />
    </ling:charOverrides>
    <!--
      Pattern overrides, all declared with type="token".
      NOTE(review): presumably text matching a pattern is kept as a single
      token (for example R and D joined by an ampersand, ".net", "C++",
      "C#"); confirm against the product documentation.
    -->
    <ling:patternOverrides>
      <!-- The ampersand must be written as an entity inside an attribute value;
           the parsed pattern is unchanged. -->
      <ling:StandardTokenizerOverride type="token" toOverride="[[:alnum:]][&amp;][[:alnum:]]" />
      <ling:StandardTokenizerOverride type="token" toOverride="[[:alnum:]]*[.]net" />
      <ling:StandardTokenizerOverride type="token" toOverride="[[:alnum:]]+[+]+" />
      <ling:StandardTokenizerOverride type="token" toOverride="[[:alnum:]]+#" />
    </ling:patternOverrides>
  </ling:StandardTokenizer>

  <!-- Normalizer -->
  <!--
    Per-word normalization entries for French ("fr").
    NOTE(review): the exact effect of NormalizerIndexLower is not visible in
    this file; confirm against the product documentation.
  -->
  <ling:NormalizerConfig>
    <ling:NormalizerIndexLower language="fr" word="thé" />
    <ling:NormalizerIndexLower language="fr" word="maïs" />
  </ling:NormalizerConfig>

  <!-- Semantic Processors -->
  <!-- One named lemmatizer per language: English and French. -->
  <ling:Lemmatizer name="englishLemmatizer" language="en" />
  <ling:Lemmatizer name="frenchLemmatizer" language="fr" />
</ling:MOTConfig>
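You can see that the Standard Tokenizer accepts specific overrides: character overrides (here, `:`) and pattern overrides (here, patterns matching forms such as `R&D`, `.net`, `C++` and `C#`), each declared with `type="token"`.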
You can also configure the Normalizer with normalization exceptions. Here, the forms "thé" and "maïs" are declared as French (`fr`) exceptions through `NormalizerIndexLower` entries.
| ||||||||||||