public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams
Modifier and Type | Field and Description |
---|---|
protected HashSet<String> |
allMapFieldsSet |
protected String |
docIdField |
protected boolean |
enabled |
protected boolean |
enableMapping |
protected boolean |
enforceSchema |
protected String[] |
fallbackFields |
protected String |
fallbackValue |
protected String[] |
inputFields |
protected String |
langField |
protected Pattern |
langPattern |
protected String |
langsField |
protected HashSet<String> |
langWhitelist |
protected HashMap<String,String> |
lcMap |
protected String[] |
mapFields |
protected boolean |
mapIndividual |
protected HashSet<String> |
mapIndividualFieldsSet |
protected boolean |
mapKeepOrig |
protected HashMap<String,String> |
mapLcMap |
protected boolean |
mapOverwrite |
protected Pattern |
mapPattern |
protected String |
mapReplaceStr |
protected int |
maxFieldValueChars |
protected int |
maxTotalChars |
protected boolean |
overwrite |
protected IndexSchema |
schema |
protected double |
threshold |
protected Pattern |
tikaSimilarityPattern |
next
DOCID_FIELD_DEFAULT, DOCID_LANGFIELD_DEFAULT, DOCID_LANGSFIELD_DEFAULT, DOCID_PARAM, DOCID_THRESHOLD_DEFAULT, ENFORCE_SCHEMA, FALLBACK, FALLBACK_FIELDS, FIELDS_PARAM, LANG_FIELD, LANG_WHITELIST, LANGS_FIELD, LANGUAGE_ID, LCMAP, MAP_ENABLE, MAP_FL, MAP_INDIVIDUAL, MAP_INDIVIDUAL_FL, MAP_KEEP_ORIG, MAP_LCMAP, MAP_OVERWRITE, MAP_PATTERN, MAP_PATTERN_DEFAULT, MAP_REPLACE, MAP_REPLACE_DEFAULT, MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT, MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT, OVERWRITE, THRESHOLD
Constructor and Description |
---|
LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
SolrQueryResponse rsp,
UpdateRequestProcessor next) |
Modifier and Type | Method and Description |
---|---|
protected abstract List<DetectedLanguage> |
detectLanguage(SolrInputDocument content)
Detects language(s) from the content of the given SolrInputDocument.
|
protected String |
getMappedField(String currentField,
String language)
Returns the name of the field to map the current contents into, so that they are properly analyzed.
|
boolean |
isEnabled()
Tells if this processor is enabled or not
|
protected String |
normalizeLangCode(String langCode)
Looks up language code in map (langid.lcmap) and returns mapped value
|
protected SolrInputDocument |
process(SolrInputDocument doc)
This is the main, testable process method called from processAdd()
|
void |
processAdd(AddUpdateCommand cmd) |
protected String |
resolveLanguage(List<DetectedLanguage> languages,
String fallbackLang)
Chooses a language based on the list of candidates detected
|
protected String |
resolveLanguage(String language,
String fallbackLang)
Chooses a language based on the list of candidates detected
|
void |
setEnabled(boolean enabled) |
close, doClose, finish, processCommit, processDelete, processMergeIndexes, processRollback
protected boolean enabled
protected String[] inputFields
protected String[] mapFields
protected Pattern mapPattern
protected String mapReplaceStr
protected String langField
protected String langsField
protected String docIdField
protected String fallbackValue
protected String[] fallbackFields
protected boolean enableMapping
protected boolean mapKeepOrig
protected boolean overwrite
protected boolean mapOverwrite
protected boolean mapIndividual
protected boolean enforceSchema
protected double threshold
protected IndexSchema schema
protected int maxFieldValueChars
protected int maxTotalChars
protected final Pattern tikaSimilarityPattern
protected final Pattern langPattern
public LanguageIdentifierUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next)
public void processAdd(AddUpdateCommand cmd) throws IOException
processAdd
in class UpdateRequestProcessor
IOException
protected SolrInputDocument process(SolrInputDocument doc)
doc
- the SolrInputDocument to work on
protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content)
content
- The content to identify
protected String resolveLanguage(String language, String fallbackLang)
language
- language code as a string
fallbackLang
- the language code to use as a fallback
protected String resolveLanguage(List<DetectedLanguage> languages, String fallbackLang)
languages
- a List of DetectedLanguages with certainty score
fallbackLang
- the language code to use as a fallback
protected String normalizeLangCode(String langCode)
langCode
- the language code string returned from detector
protected String getMappedField(String currentField, String language)
currentField
- The current field name
language
- the language code
public boolean isEnabled()
public void setEnabled(boolean enabled)
Copyright © 2000-2018 Apache Software Foundation. All Rights Reserved.