[gen] Indexer: textual content of indexes now keeps only words of more than 2 chars, plus other improvements.
This commit is contained in:
parent
06c656d278
commit
fd5e88928d
|
@ -69,16 +69,17 @@ def updateIndexes(installer, indexInfo):
|
||||||
logger.info('Done.')
|
logger.info('Done.')
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
def splitIntoWords(text):
|
def splitIntoWords(text, ignore=2):
|
||||||
'''Split the cleaned index value p_text into words (returns a list of
|
'''Split the cleaned index value p_text into words (returns a list of
|
||||||
words). Words of a single char are ignored, excepted digits which are
|
words). Words whose length is below p_ignore are ignored, excepted digits
|
||||||
always kept. Duplicate words are removed (result is a set and not a
|
which are always kept. Duplicate words are removed (result is a set and
|
||||||
list).'''
|
not a list).'''
|
||||||
|
# Split p_text into words
|
||||||
res = text.split()
|
res = text.split()
|
||||||
# Remove tokens of a single char (excepted if this char is a digit).
|
# Remove shorter words not being figures
|
||||||
i = len(res) - 1
|
i = len(res) - 1
|
||||||
while i > -1:
|
while i > -1:
|
||||||
if (len(res[i]) < 2) and not res[i].isdigit():
|
if (len(res[i]) <= ignore) and not res[i].isdigit():
|
||||||
del res[i]
|
del res[i]
|
||||||
i -= 1
|
i -= 1
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
|
|
|
@ -244,6 +244,7 @@ class ToolMixin(BaseMixin):
|
||||||
# The search is triggered from an app-wide search
|
# The search is triggered from an app-wide search
|
||||||
klass = self.getAppyClass(className)
|
klass = self.getAppyClass(className)
|
||||||
fieldNames = getattr(klass, 'searchFields', None)
|
fieldNames = getattr(klass, 'searchFields', None)
|
||||||
|
if callable(fieldNames): fieldNames = fieldNames(self.appy())
|
||||||
if not fieldNames:
|
if not fieldNames:
|
||||||
# Gather all the indexed fields on this class
|
# Gather all the indexed fields on this class
|
||||||
fieldNames = [f.name for f in self.getAllAppyTypes(className) \
|
fieldNames = [f.name for f in self.getAllAppyTypes(className) \
|
||||||
|
|
|
@ -217,9 +217,12 @@ def executeCommand(cmd):
|
||||||
return res
|
return res
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
unwantedChars = ('\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '\t', "'")
|
charsIgnore = u'.,:;*+=~?%^\'’"<>{}[]|\t\\'
|
||||||
|
fileNameIgnore = charsIgnore + u' $£€/'
|
||||||
|
extractIgnore = charsIgnore + '()'
|
||||||
alphaRex = re.compile('[a-zA-Z]')
|
alphaRex = re.compile('[a-zA-Z]')
|
||||||
alphanumRex = re.compile('[a-zA-Z0-9]')
|
alphanumRex = re.compile('[a-zA-Z0-9]')
|
||||||
|
|
||||||
def normalizeString(s, usage='fileName'):
|
def normalizeString(s, usage='fileName'):
|
||||||
'''Returns a version of string p_s whose special chars (like accents) have
|
'''Returns a version of string p_s whose special chars (like accents) have
|
||||||
been replaced with normal chars. Moreover, if p_usage is:
|
been replaced with normal chars. Moreover, if p_usage is:
|
||||||
|
@ -233,21 +236,25 @@ def normalizeString(s, usage='fileName'):
|
||||||
try:
|
try:
|
||||||
s = s.decode('utf-8')
|
s = s.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# Another encoding may be in use.
|
# Another encoding may be in use
|
||||||
s = s.decode('latin-1')
|
s = s.decode('latin-1')
|
||||||
elif not isinstance(s, unicode): s = unicode(s)
|
elif not isinstance(s, unicode): s = unicode(s)
|
||||||
|
# For extracted text, replace any unwanted char with a blank
|
||||||
if usage == 'extractedText':
|
if usage == 'extractedText':
|
||||||
# Replace single quotes with blanks.
|
res = u''
|
||||||
s = s.replace("'", " ").replace(u'’', ' ')
|
for char in s:
|
||||||
# Remove any special char like accents.
|
if char not in extractIgnore: res += char
|
||||||
|
else: res += ' '
|
||||||
|
s = res
|
||||||
|
# Standardize special chars like accents
|
||||||
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
||||||
# Remove any other char, depending on p_usage.
|
# Remove any other char, depending on p_usage
|
||||||
if usage == 'fileName':
|
if usage == 'fileName':
|
||||||
# Remove any char that can't be found within a file name under
|
# Remove any char that can't be found within a file name under Windows
|
||||||
# Windows or that could lead to problems with OpenOffice.
|
# or that could lead to problems with LibreOffice.
|
||||||
res = ''
|
res = ''
|
||||||
for char in s:
|
for char in s:
|
||||||
if char not in unwantedChars: res += char
|
if char not in fileNameIgnore: res += char
|
||||||
elif usage.startswith('alpha'):
|
elif usage.startswith('alpha'):
|
||||||
exec 'rex = %sRex' % usage
|
exec 'rex = %sRex' % usage
|
||||||
res = ''
|
res = ''
|
||||||
|
@ -257,7 +264,7 @@ def normalizeString(s, usage='fileName'):
|
||||||
res = s
|
res = s
|
||||||
else:
|
else:
|
||||||
res = s
|
res = s
|
||||||
# Re-code the result as a str if a str was given.
|
# Re-code the result as a str if a str was given
|
||||||
if strNeeded: res = res.encode('utf-8')
|
if strNeeded: res = res.encode('utf-8')
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue