Consider, for example, the Japanese content provided in the following five files:
It is normal for […]. If you examine how the […]. Therefore, it is normal for both DOC4 and DOC5 to match it.
Query […]. The processing of Japanese requests is context-dependent. At index time, it might be indexed as either […].
At index and search time, we also perform some Katakana/Hiragana conversions and Romanji (Latin forms) conversions.
Note:
3DSpace Index uses the following linguistic
settings:
<!-- Linguistic configuration containing a single tokenization config, "tok0". -->
<LinguisticConfig xmlns="exa:com.exalead.linguistic.v10" version="1330354820991">
  <TokenizationConfig name="tok0">
    <!-- Standard tokenizer with both alpha/num concatenation flags enabled. -->
    <StandardTokenizer concatAlphaNum="true" concatNumAlpha="true">
      <GermanDesagglutination />
      <DutchDesagglutination />
      <NorwegianDesagglutination />
      <charOverrides />
      <!-- Token-type overrides: each toOverride value is a pattern.
           NOTE(review): presumably these keep strings such as "R&D", ".net",
           "C++" and "C#" as single tokens; confirm against the tokenizer
           documentation. -->
      <patternOverrides>
        <!-- A literal ampersand must be written as &amp; inside an attribute
             value; the parsed pattern is still [[:alnum:]][&][[:alnum:]]. -->
        <StandardTokenizerOverride type="token" toOverride="[[:alnum:]][&amp;][[:alnum:]]" />
        <StandardTokenizerOverride type="token" toOverride="[[:alnum:]]*[.](?i:net)" />
        <StandardTokenizerOverride type="token" toOverride="[[:alnum:]]+[+]+" />
        <StandardTokenizerOverride type="token" toOverride="[[:alnum:]]+#" />
        <!-- One override per single digit, 0 through 9. -->
        <StandardTokenizerOverride type="token" toOverride="0" />
        <StandardTokenizerOverride type="token" toOverride="1" />
        <StandardTokenizerOverride type="token" toOverride="2" />
        <StandardTokenizerOverride type="token" toOverride="3" />
        <StandardTokenizerOverride type="token" toOverride="4" />
        <StandardTokenizerOverride type="token" toOverride="5" />
        <StandardTokenizerOverride type="token" toOverride="6" />
        <StandardTokenizerOverride type="token" toOverride="7" />
        <StandardTokenizerOverride type="token" toOverride="8" />
        <StandardTokenizerOverride type="token" toOverride="9" />
      </patternOverrides>
    </StandardTokenizer>
    <!-- Japanese tokenizer: Romanji forms are added, morphology is not. -->
    <JapaneseTokenizer addRomanji="true" addMorphology="false" />
    <ChineseTokenizer addSimplified="false" />
    <!-- Maps each form tag to an indexKind value; the Japanese-specific
         forms (ja*) use kinds 42 through 46. -->
    <FormIndexingConfig>
      <Form tag="SubTokenizerLowercase" indexKind="1" />
      <Form tag="SubTokenizerNormalize" indexKind="2" />
      <Form tag="SubTokenizerConcatLowercase" indexKind="1" />
      <Form tag="SubTokenizerConcatNormalize" indexKind="2" />
      <Form tag="cjk" indexKind="2" />
      <Form tag="jafactorized" indexKind="42" />
      <Form tag="jaexpanded" indexKind="43" />
      <Form tag="jaromanji" indexKind="44" />
      <Form tag="jaradicalfactor" indexKind="45" />
      <Form tag="jaradicalexpand" indexKind="46" />
    </FormIndexingConfig>
    <NormalizerConfig useNormalizationExceptions="true" useGermanExceptions="false" />
  </TokenizationConfig>
</LinguisticConfig>
When indexing a string with a combination of single-byte and double-byte characters (for
example,
| ||||||||||