[gen] Added the possibility to cleanly extract text from XHTML field values.
This commit is contained in:
		
							parent
							
								
									d3e2478d6b
								
							
						
					
					
						commit
						a2ae839704
					
				
					 3 changed files with 60 additions and 2 deletions
				
			
		
							
								
								
									
										49
									
								
								gen/indexer.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										49
									
								
								gen/indexer.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,49 @@
 | 
				
			||||||
 | 
					'''This file defines code for extracting, from field values, the text to be
 | 
				
			||||||
 | 
					   indexed.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
 | 
					from Products.ZCTextIndex.PipelineFactory import element_factory
 | 
				
			||||||
 | 
					from appy.shared.xml_parser import XmlParser
 | 
				
			||||||
 | 
					from appy.shared.utils import normalizeString
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
 | 
					class XhtmlTextExtractor(XmlParser):
					    '''SAX-based parser that collects the textual content of a XHTML
					       chunk. While parsing, self.res is a list of normalized text
					       fragments; once the document is finished, it becomes a single
					       string where fragments are separated by blanks.'''

					    def startDocument(self):
					        '''Lets the base parser initialise itself, then starts with an
					           empty list of text fragments.'''
					        XmlParser.startDocument(self)
					        self.res = []

					    def endDocument(self):
					        '''Turns the collected fragments into one blank-separated
					           string, then delegates to the base parser.'''
					        fragments = self.res
					        self.res = ' '.join(fragments)
					        return XmlParser.endDocument(self)

					    def characters(self, content):
					        '''Normalizes p_content (usage "extractedText"), strips it and
					           lowercases it. The result is stored only if it is longer
					           than one char. Returns self.env (an attribute that is not
					           set in this class).'''
					        normalized = normalizeString(content, usage='extractedText')
					        cleaned = normalized.strip().lower()
					        if len(cleaned) > 1:
					            self.res.append(cleaned)
					        return self.env

					    # The 3 error-handling methods below are deliberately empty, so that
					    # parsing problems do not raise exceptions.
					    def error(self, error):
					        pass

					    def fatalError(self, error):
					        pass

					    def warning(self, error):
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
 | 
					class XhtmlIndexer:
					    '''Extracts, from XHTML field values, the text to index.'''

					    def process(self, text):
					        '''Converts p_text, a sequence of XHTML chunks, into the list of
					           tokens to index.

					           Every chunk is wrapped into a root "p" tag, parsed with a
					           XhtmlTextExtractor, and the extracted text is split on
					           blanks. Empty tokens and single-char tokens are dropped,
					           except single digits. All chunks are processed (not only the
					           first one) and an empty p_text produces an empty list.'''
					        res = []
					        for chunk in text:
					            # Wrap the XHTML chunk into a root tag, to get valid XML.
					            extracted = XhtmlTextExtractor().parse('<p>%s</p>' % chunk)
					            # Splitting on a single blank produces empty strings between
					            # consecutive blanks: the length test removes them as well.
					            res.extend([token for token in extracted.split(' ')
					                        if (len(token) > 1) or token.isdigit()])
					        return res
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
 | 
					# Make class XhtmlIndexer known to ZCTextIndex's element_factory at import
					# time. 'XHTML indexer' is passed for both string arguments (presumably the
					# group and the name of the pipeline element -- to be confirmed against
					# Products.ZCTextIndex.PipelineFactory).
					element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
 | 
				
			||||||
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
| 
						 | 
					@ -206,6 +206,9 @@ def normalizeString(s, usage='fileName'):
 | 
				
			||||||
    # We work in unicode. Convert p_s to unicode if not unicode.
 | 
					    # We work in unicode. Convert p_s to unicode if not unicode.
 | 
				
			||||||
    if isinstance(s, str):           s = s.decode('utf-8')
 | 
					    if isinstance(s, str):           s = s.decode('utf-8')
 | 
				
			||||||
    elif not isinstance(s, unicode): s = unicode(s)
 | 
					    elif not isinstance(s, unicode): s = unicode(s)
 | 
				
			||||||
 | 
					    if usage == 'extractedText':
 | 
				
			||||||
 | 
					        # Replace single quotes with blanks.
 | 
				
			||||||
 | 
					        s = s.replace("'", " ").replace(u'’', ' ')
 | 
				
			||||||
    # Remove any special char like accents.
 | 
					    # Remove any special char like accents.
 | 
				
			||||||
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
 | 
					    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
 | 
				
			||||||
    # Remove any other char, depending on p_usage.
 | 
					    # Remove any other char, depending on p_usage.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -19,8 +19,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ------------------------------------------------------------------------------
 | 
					# ------------------------------------------------------------------------------
 | 
				
			||||||
import xml.sax, difflib, types, cgi
 | 
					import xml.sax, difflib, types, cgi
 | 
				
			||||||
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
 | 
					from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges
 | 
				
			||||||
                            property_interning_dict
 | 
					 | 
				
			||||||
from xml.sax.xmlreader import InputSource
 | 
					from xml.sax.xmlreader import InputSource
 | 
				
			||||||
from xml.sax import SAXParseException
 | 
					from xml.sax import SAXParseException
 | 
				
			||||||
from appy.shared import UnicodeBuffer, xmlPrologue
 | 
					from appy.shared import UnicodeBuffer, xmlPrologue
 | 
				
			||||||
| 
						 | 
					@ -170,6 +169,13 @@ class XmlParser(ContentHandler, ErrorHandler):
 | 
				
			||||||
            # Put a question mark instead of raising an exception.
 | 
					            # Put a question mark instead of raising an exception.
 | 
				
			||||||
            self.characters('?')
 | 
					            self.characters('?')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # ErrorHandler methods ---------------------------------------------------
 | 
				
			||||||
 | 
					    # Define methods below in your subclass if you want error handling that
 | 
				
			||||||
 | 
					    # does not raise exceptions, but produces a partial result instead.
 | 
				
			||||||
 | 
					    #def error(self, error): pass
 | 
				
			||||||
 | 
					    #def fatalError(self, error): pass
 | 
				
			||||||
 | 
					    #def warning(self, error): pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def parse(self, xml, source='string'):
 | 
					    def parse(self, xml, source='string'):
 | 
				
			||||||
        '''Parses a XML stream.
 | 
					        '''Parses a XML stream.
 | 
				
			||||||
           * If p_source is "string", p_xml must be a string containing
 | 
					           * If p_source is "string", p_xml must be a string containing
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue