From a2ae8397044763ae2e82a4ce73d281e99a9a2be1 Mon Sep 17 00:00:00 2001 From: Gaetan Delannay Date: Tue, 25 Sep 2012 21:43:45 +0200 Subject: [PATCH] [gen] Added the possibility to extract in a clean way text from XHTML field values. --- gen/indexer.py | 49 ++++++++++++++++++++++++++++++++++++++++++++ shared/utils.py | 3 +++ shared/xml_parser.py | 10 +++++++-- 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 gen/indexer.py diff --git a/gen/indexer.py b/gen/indexer.py new file mode 100644 index 0000000..915c10b --- /dev/null +++ b/gen/indexer.py @@ -0,0 +1,49 @@ +'''This file defines code for extracting, from field values, the text to be + indexed.''' + +# ------------------------------------------------------------------------------ +from Products.ZCTextIndex.PipelineFactory import element_factory +from appy.shared.xml_parser import XmlParser +from appy.shared.utils import normalizeString + +# ------------------------------------------------------------------------------ +class XhtmlTextExtractor(XmlParser): + '''Extracts text from XHTML.''' + def startDocument(self): + XmlParser.startDocument(self) + self.res = [] + + def endDocument(self): + self.res = ' '.join(self.res) + return XmlParser.endDocument(self) + + def characters(self, content): + c = normalizeString(content, usage='extractedText').strip().lower() + if len(c) > 1: self.res.append(c) + return self.env + + # Do not raise exceptions when errors occur. + def error(self, error): pass + def fatalError(self, error): pass + def warning(self, error): pass + +# ------------------------------------------------------------------------------ +class XhtmlIndexer: + '''Extracts, from XHTML field values, the text to index.''' + def process(self, text): + # Wrap the XHTML chunk into a root tag, to get valid XML. + text = '

<p>%s</p>

' % text[0] + parser = XhtmlTextExtractor() + text = parser.parse(text) + res = text.split(' ') + # Remove tokens of a single char. + i = len(res)-1 + while i > -1 : + if (len(res[i]) < 2) and not res[i].isdigit(): + del res[i] + i -= 1 + return res + +# ------------------------------------------------------------------------------ +element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer) +# ------------------------------------------------------------------------------ diff --git a/shared/utils.py b/shared/utils.py index 0701c9f..d9db99b 100644 --- a/shared/utils.py +++ b/shared/utils.py @@ -206,6 +206,9 @@ def normalizeString(s, usage='fileName'): # We work in unicode. Convert p_s to unicode if not unicode. if isinstance(s, str): s = s.decode('utf-8') elif not isinstance(s, unicode): s = unicode(s) + if usage == 'extractedText': + # Replace single quotes with blanks. + s = s.replace("'", " ").replace(u'’', ' ') # Remove any special char like accents. s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore') # Remove any other char, depending on p_usage. diff --git a/shared/xml_parser.py b/shared/xml_parser.py index 2994fdd..0386889 100644 --- a/shared/xml_parser.py +++ b/shared/xml_parser.py @@ -19,8 +19,7 @@ # ------------------------------------------------------------------------------ import xml.sax, difflib, types, cgi -from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\ - property_interning_dict +from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges from xml.sax.xmlreader import InputSource from xml.sax import SAXParseException from appy.shared import UnicodeBuffer, xmlPrologue @@ -170,6 +169,13 @@ class XmlParser(ContentHandler, ErrorHandler): # Put a question mark instead of raising an exception. 
self.characters('?') + # ErrorHandler methods --------------------------------------------------- + # Define methods below in your subclass if you want error handling that + # does not raise exceptions, but produces a partial result instead. + #def error(self, error): pass + #def fatalError(self, error): pass + #def warning(self, error): pass + def parse(self, xml, source='string'): '''Parses a XML stream. * If p_source is "string", p_xml must be a string containing