org.apache.solr.update.processor
Class LanguageIdentifierUpdateProcessor

java.lang.Object
  extended by org.apache.solr.update.processor.UpdateRequestProcessor
      extended by org.apache.solr.update.processor.LanguageIdentifierUpdateProcessor
All Implemented Interfaces:
LangIdParams
Direct Known Subclasses:
LangDetectLanguageIdentifierUpdateProcessor, TikaLanguageIdentifierUpdateProcessor

public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestProcessor
implements LangIdParams

Identifies the language of a set of input fields. Also supports mapping of field names based on the detected language.

See http://wiki.apache.org/solr/LanguageDetection

Since:
3.5
WARNING: This API is experimental and might change in incompatible ways in the next release.

Field Summary
protected  HashSet<String> allMapFieldsSet
           
protected  String docIdField
           
protected  boolean enabled
           
protected  boolean enableMapping
           
protected  boolean enforceSchema
           
protected  String[] fallbackFields
           
protected  String fallbackValue
           
protected  String[] inputFields
           
protected  String langField
           
protected  Pattern langPattern
           
protected  String langsField
           
protected  HashSet<String> langWhitelist
           
protected  HashMap<String,String> lcMap
           
protected static org.slf4j.Logger log
           
protected  String[] mapFields
           
protected  boolean mapIndividual
           
protected  HashSet<String> mapIndividualFieldsSet
           
protected  boolean mapKeepOrig
           
protected  boolean mapOverwrite
           
protected  Pattern mapPattern
           
protected  String mapReplaceStr
           
protected  boolean overwrite
           
protected  IndexSchema schema
           
protected  double threshold
           
protected  Pattern tikaSimilarityPattern
           
 
Fields inherited from class org.apache.solr.update.processor.UpdateRequestProcessor
next
 
Fields inherited from interface org.apache.solr.update.processor.LangIdParams
DOCID_FIELD_DEFAULT, DOCID_LANGFIELD_DEFAULT, DOCID_LANGSFIELD_DEFAULT, DOCID_PARAM, DOCID_THRESHOLD_DEFAULT, ENFORCE_SCHEMA, FALLBACK, FALLBACK_FIELDS, FIELDS_PARAM, LANG_FIELD, LANG_WHITELIST, LANGS_FIELD, LANGUAGE_ID, MAP_ENABLE, MAP_FL, MAP_INDIVIDUAL, MAP_INDIVIDUAL_FL, MAP_KEEP_ORIG, MAP_LCMAP, MAP_OVERWRITE, MAP_PATTERN, MAP_PATTERN_DEFAULT, MAP_REPLACE, MAP_REPLACE_DEFAULT, OVERWRITE, THRESHOLD
 
Constructor Summary
LanguageIdentifierUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next)
           
 
Method Summary
protected  String concatFields(SolrInputDocument doc, String[] fields)
           
protected abstract  List<DetectedLanguage> detectLanguage(String content)
          Detects language(s) from a string.
protected  String getMappedField(String currentField, String language)
          Returns the name of the field to map the current contents into, so that they are properly analyzed.
 boolean isEnabled()
          Tells whether this processor is enabled
protected  SolrInputDocument process(SolrInputDocument doc)
          This is the main, testable process method called from processAdd()
 void processAdd(AddUpdateCommand cmd)
           
protected  String resolveLanguage(List<DetectedLanguage> languages, String fallbackLang)
          Chooses a language based on the list of candidates detected
protected  String resolveLanguage(String language, String fallbackLang)
          Chooses a language based on a single language code given as a string
 void setEnabled(boolean enabled)
           
 
Methods inherited from class org.apache.solr.update.processor.UpdateRequestProcessor
finish, processCommit, processDelete, processMergeIndexes, processRollback
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

log

protected static final org.slf4j.Logger log

enabled

protected boolean enabled

inputFields

protected String[] inputFields

mapFields

protected String[] mapFields

mapPattern

protected Pattern mapPattern

mapReplaceStr

protected String mapReplaceStr

langField

protected String langField

langsField

protected String langsField

docIdField

protected String docIdField

fallbackValue

protected String fallbackValue

fallbackFields

protected String[] fallbackFields

enableMapping

protected boolean enableMapping

mapKeepOrig

protected boolean mapKeepOrig

overwrite

protected boolean overwrite

mapOverwrite

protected boolean mapOverwrite

mapIndividual

protected boolean mapIndividual

enforceSchema

protected boolean enforceSchema

threshold

protected double threshold

langWhitelist

protected HashSet<String> langWhitelist

mapIndividualFieldsSet

protected HashSet<String> mapIndividualFieldsSet

allMapFieldsSet

protected HashSet<String> allMapFieldsSet

lcMap

protected HashMap<String,String> lcMap

schema

protected IndexSchema schema

tikaSimilarityPattern

protected final Pattern tikaSimilarityPattern

langPattern

protected final Pattern langPattern
Constructor Detail

LanguageIdentifierUpdateProcessor

public LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
                                         SolrQueryResponse rsp,
                                         UpdateRequestProcessor next)
Method Detail

processAdd

public void processAdd(AddUpdateCommand cmd)
                throws IOException
Overrides:
processAdd in class UpdateRequestProcessor
Throws:
IOException

process

protected SolrInputDocument process(SolrInputDocument doc)
This is the main, testable process method called from processAdd()

Parameters:
doc - the SolrInputDocument to work on
Returns:
the modified SolrInputDocument

concatFields

protected String concatFields(SolrInputDocument doc,
                              String[] fields)

detectLanguage

protected abstract List<DetectedLanguage> detectLanguage(String content)
Detects language(s) from a string. Because this method is abstract, each concrete subclass must implement it to supply its own language detection module.

Parameters:
content - The content to identify
Returns:
List of detected language(s) according to RFC-3066

resolveLanguage

protected String resolveLanguage(String language,
                                 String fallbackLang)
Chooses a language based on a single language code given as a string

Parameters:
language - language code as a string
fallbackLang - the language code to use as a fallback
Returns:
a string of the chosen language

resolveLanguage

protected String resolveLanguage(List<DetectedLanguage> languages,
                                 String fallbackLang)
Chooses a language based on the list of candidates detected

Parameters:
languages - a List of DetectedLanguages with certainty score
fallbackLang - the language code to use as a fallback
Returns:
a string of the chosen language

getMappedField

protected String getMappedField(String currentField,
                                String language)
Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance, if the currentField is "text" and the language code is "en", the new field would by default be "text_en". This method also performs a custom regex pattern replacement if one is configured. If enforceSchema=true and the resulting field name doesn't exist, then null is returned.

Parameters:
currentField - The current field name
language - the language code
Returns:
The new schema field name, based on pattern and replace, or null if illegal

isEnabled

public boolean isEnabled()
Tells whether this processor is enabled

Returns:
true if enabled, else false

setEnabled

public void setEnabled(boolean enabled)


Copyright © 2000-2013 Apache Software Foundation. All Rights Reserved.