[gen] Bugfixes in the search machinery.

2015-01-02 16:16:48 +01:00 · 2015-01-02 16:16:48 +01:00 · 225ea927a4
commit 225ea927a4
parent cf2cbc52d6
16 changed files with 81 additions and 43 deletions
--- a/gen/utils.py
+++ b/gen/utils.py
@ -91,18 +91,35 @@ class SomeObjects:
        else:               getMethod = 'getObject'
        self.objects = [getattr(b, getMethod)() for b in brains]

+# ------------------------------------------------------------------------------
+def splitIntoWords(text, ignore=2):
+    '''Split the cleaned index value p_text into words. Words whose length is
+       lower than or equal to p_ignore are ignored, except words made only of
+       digits, which are always kept. Duplicate words are removed: the result
+       is a set, not a list.'''
+    # Split p_text into words
+    res = text.split()
+    # Remove words whose length is <= p_ignore, unless they are made of digits
+    i = len(res) - 1
+    while i > -1:
+        if (len(res[i]) <= ignore) and not res[i].isdigit():
+            del res[i]
+        i -= 1
+    # Remove duplicates
+    return set(res)
+
 # ------------------------------------------------------------------------------
 class Keywords:
    '''This class allows to handle keywords that a user enters and that will be
       used as basis for performing requests in a TextIndex/XhtmlIndex.'''

    toRemove = '?-+*()'
-    def __init__(self, keywords, operator='AND'):
-        # Clean the p_keywords that the user has entered.
+    def __init__(self, keywords, operator='AND', ignore=2):
+        # Clean the p_keywords that the user has entered
        words = sutils.normalizeText(keywords)
        if words == '*': words = ''
        for c in self.toRemove: words = words.replace(c, ' ')
-        self.keywords = words.split()
+        self.keywords = splitIntoWords(words, ignore=ignore)
        # Store the operator to apply to the keywords (AND or OR)
        self.operator = operator