[gen] Indexer: textual content of indexes now keeps only words of more than 2 chars, plus other improvements.
This commit is contained in:
parent
06c656d278
commit
fd5e88928d
|
@ -69,16 +69,17 @@ def updateIndexes(installer, indexInfo):
|
||||||
logger.info('Done.')
|
logger.info('Done.')
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
def splitIntoWords(text):
|
def splitIntoWords(text, ignore=2):
|
||||||
'''Split the cleaned index value p_text into words (returns a list of
|
'''Split the cleaned index value p_text into words (returns a list of
|
||||||
words). Words of a single char are ignored, excepted digits which are
|
words). Words whose length is below p_ignore are ignored, excepted digits
|
||||||
always kept. Duplicate words are removed (result is a set and not a
|
which are always kept. Duplicate words are removed (result is a set and
|
||||||
list).'''
|
not a list).'''
|
||||||
|
# Split p_text into words
|
||||||
res = text.split()
|
res = text.split()
|
||||||
# Remove tokens of a single char (excepted if this char is a digit).
|
# Remove shorter words not being figures
|
||||||
i = len(res) - 1
|
i = len(res) - 1
|
||||||
while i > -1:
|
while i > -1:
|
||||||
if (len(res[i]) < 2) and not res[i].isdigit():
|
if (len(res[i]) <= ignore) and not res[i].isdigit():
|
||||||
del res[i]
|
del res[i]
|
||||||
i -= 1
|
i -= 1
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
|
|
|
@ -244,6 +244,7 @@ class ToolMixin(BaseMixin):
|
||||||
# The search is triggered from an app-wide search
|
# The search is triggered from an app-wide search
|
||||||
klass = self.getAppyClass(className)
|
klass = self.getAppyClass(className)
|
||||||
fieldNames = getattr(klass, 'searchFields', None)
|
fieldNames = getattr(klass, 'searchFields', None)
|
||||||
|
if callable(fieldNames): fieldNames = fieldNames(self.appy())
|
||||||
if not fieldNames:
|
if not fieldNames:
|
||||||
# Gather all the indexed fields on this class
|
# Gather all the indexed fields on this class
|
||||||
fieldNames = [f.name for f in self.getAllAppyTypes(className) \
|
fieldNames = [f.name for f in self.getAllAppyTypes(className) \
|
||||||
|
|
|
@ -217,9 +217,12 @@ def executeCommand(cmd):
|
||||||
return res
|
return res
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '\t', "'")
|
charsIgnore = u'.,:;*+=~?%^\'’"<>{}[]|\t\\'
|
||||||
|
fileNameIgnore = charsIgnore + u' $£€/'
|
||||||
|
extractIgnore = charsIgnore + '()'
|
||||||
alphaRex = re.compile('[a-zA-Z]')
|
alphaRex = re.compile('[a-zA-Z]')
|
||||||
alphanumRex = re.compile('[a-zA-Z0-9]')
|
alphanumRex = re.compile('[a-zA-Z0-9]')
|
||||||
|
|
||||||
def normalizeString(s, usage='fileName'):
|
def normalizeString(s, usage='fileName'):
|
||||||
'''Returns a version of string p_s whose special chars (like accents) have
|
'''Returns a version of string p_s whose special chars (like accents) have
|
||||||
been replaced with normal chars. Moreover, if p_usage is:
|
been replaced with normal chars. Moreover, if p_usage is:
|
||||||
|
@ -233,21 +236,25 @@ def normalizeString(s, usage='fileName'):
|
||||||
try:
|
try:
|
||||||
s = s.decode('utf-8')
|
s = s.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# Another encoding may be in use.
|
# Another encoding may be in use
|
||||||
s = s.decode('latin-1')
|
s = s.decode('latin-1')
|
||||||
elif not isinstance(s, unicode): s = unicode(s)
|
elif not isinstance(s, unicode): s = unicode(s)
|
||||||
|
# For extracted text, replace any unwanted char with a blank
|
||||||
if usage == 'extractedText':
|
if usage == 'extractedText':
|
||||||
# Replace single quotes with blanks.
|
res = u''
|
||||||
s = s.replace("'", " ").replace(u'’', ' ')
|
for char in s:
|
||||||
# Remove any special char like accents.
|
if char not in extractIgnore: res += char
|
||||||
|
else: res += ' '
|
||||||
|
s = res
|
||||||
|
# Standardize special chars like accents
|
||||||
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
||||||
# Remove any other char, depending on p_usage.
|
# Remove any other char, depending on p_usage
|
||||||
if usage == 'fileName':
|
if usage == 'fileName':
|
||||||
# Remove any char that can't be found within a file name under
|
# Remove any char that can't be found within a file name under Windows
|
||||||
# Windows or that could lead to problems with OpenOffice.
|
# or that could lead to problems with LibreOffice.
|
||||||
res = ''
|
res = ''
|
||||||
for char in s:
|
for char in s:
|
||||||
if char not in unwantedChars: res += char
|
if char not in fileNameIgnore: res += char
|
||||||
elif usage.startswith('alpha'):
|
elif usage.startswith('alpha'):
|
||||||
exec 'rex = %sRex' % usage
|
exec 'rex = %sRex' % usage
|
||||||
res = ''
|
res = ''
|
||||||
|
@ -257,7 +264,7 @@ def normalizeString(s, usage='fileName'):
|
||||||
res = s
|
res = s
|
||||||
else:
|
else:
|
||||||
res = s
|
res = s
|
||||||
# Re-code the result as a str if a str was given.
|
# Re-code the result as a str if a str was given
|
||||||
if strNeeded: res = res.encode('utf-8')
|
if strNeeded: res = res.encode('utf-8')
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue