Create Tokens
Basically, a token is the aggregation of:
- A form (a piece of the input text),
- A type (alphabetical, punctuation, separator, etc.),
- And an array of annotations.
When you develop a tokenizer, you must create tokens and set several of the fields listed below. The framework sets some of them automatically.
package com.exalead.mot.v10; public class AnnotatedToken { /// The token value (form) public String token; /// The token kind (type) public int kind; /// The language code (defined by com.exalead.lang.Language) public int lang; /// The position in the original text of this token (in terms of characters) public int offset; /// The list of annotations attached to this token public Annotation[] annotations; /// Returns the XML representation of this token and its annotations public String toString(); /// Returns the annotations attached to this token having the given tag /// If none, returns the empty list. public List<Annotation> getAnnotationsWithTag(String tag); /// Returns the annotations having any of the given tags. /// If none, returns the empty list. public List<Annotation> getAnnotationsWithTags(Collection<String> tags); /// Returns the annotations having the given tag and display form. /// If none, returns the empty list. public List<Annotation> getAnnotationsWithTagAndDisplay(String tag, String display); /// Token kinds and their default interpretation public final static int TOKEN_UNKNOWN = 0; /// unknown public final static int TOKEN_SEP_IGNORE = 1; /// separator [[:ctrl:]] public final static int TOKEN_SEP_SPACE = 2; /// space [[:space:]] public final static int TOKEN_SEP_SENTENCE = 4; /// sentence public final static int TOKEN_SEP_PARAGRAPH = 8; /// paragraph (\n\n) public final static int TOKEN_SEP_QUOTE = 16; /// quote ["'] public final static int TOKEN_SEP_DASH = 32; /// dash [-] public final static int TOKEN_SEP_PUNCT = 64; /// punct [[:punct:]] public final static int TOKEN_NUMBER = 128; /// number [0-9]+ public final static int TOKEN_ALPHANUM = 256; /// alphanum [a-zA-Z0-9]+ public final static int TOKEN_ALPHA = 512; /// alpha [a-zA-Z]+ }