[gen] Added the possibility to cleanly extract text from XHTML field values.
This commit is contained in:
parent
d3e2478d6b
commit
a2ae839704
49
gen/indexer.py
Normal file
49
gen/indexer.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
'''This file defines code for extracting, from field values, the text to be
|
||||||
|
indexed.'''
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
from Products.ZCTextIndex.PipelineFactory import element_factory
|
||||||
|
from appy.shared.xml_parser import XmlParser
|
||||||
|
from appy.shared.utils import normalizeString
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
class XhtmlTextExtractor(XmlParser):
    '''Parser that collects, from a chunk of XHTML, the text to index. While
       parsing, self.res is a list of normalized, lowercased text chunks; once
       the document end is reached, it becomes a single string where those
       chunks are separated by blanks.'''

    def startDocument(self):
        '''Lets the base parser initialise itself, then starts with an empty
           list of extracted chunks.'''
        XmlParser.startDocument(self)
        # Text chunks are accumulated here by m_characters.
        self.res = []

    def endDocument(self):
        '''Merges the collected chunks into one blank-separated string and
           returns whatever the base parser returns.'''
        self.res = ' '.join(self.res)
        return XmlParser.endDocument(self)

    def characters(self, content):
        '''Normalizes p_content (usage "extractedText"), strips and lowercases
           it, and stores it if it is worth indexing. Returns self.env.'''
        c = normalizeString(content, usage='extractedText').strip().lower()
        # Ignore empty and single-char chunks, except lone digits:
        # XhtmlIndexer.process keeps one-digit tokens, so a chunk like the "5"
        # in "<td>5</td>" must not be discarded here.
        if (len(c) > 1) or c.isdigit(): self.res.append(c)
        # NOTE(review): self.env is presumably set by XmlParser.startDocument
        # - confirm.
        return self.env

    # Do not raise exceptions when errors occur: the goal is to get a partial
    # result rather than a failure when the XHTML is not well-formed.
    def error(self, error): pass
    def fatalError(self, error): pass
    def warning(self, error): pass
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
class XhtmlIndexer:
    '''Extracts, from XHTML field values, the text to index.'''

    def process(self, text):
        '''Returns the list of tokens to index for p_text, a list of XHTML
           chunks (presumably handed over by the ZCTextIndex lexicon pipeline
           - confirm against the caller). Every chunk is parsed; an empty
           list produces an empty result instead of raising an IndexError.
           Tokens of less than 2 chars are dropped, unless they are
           digits.'''
        res = []
        for chunk in text:
            # Wrap the XHTML chunk into a root tag, to get valid XML. A fresh
            # parser is used for every chunk.
            parser = XhtmlTextExtractor()
            extracted = parser.parse('<p>%s</p>' % chunk)
            for token in extracted.split(' '):
                # Remove tokens of a single char (and empty ones), but keep
                # single digits.
                if (len(token) > 1) or token.isdigit():
                    res.append(token)
        return res
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
# Register XhtmlIndexer in the ZCTextIndex element factory at import time.
# NOTE(review): presumably the first 'XHTML indexer' is the element group and
# the second one the element name - confirm against registerFactory's signature.
element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
|
||||||
|
# ------------------------------------------------------------------------------
|
|
@ -206,6 +206,9 @@ def normalizeString(s, usage='fileName'):
|
||||||
# We work in unicode. Convert p_s to unicode if not unicode.
|
# We work in unicode. Convert p_s to unicode if not unicode.
|
||||||
if isinstance(s, str): s = s.decode('utf-8')
|
if isinstance(s, str): s = s.decode('utf-8')
|
||||||
elif not isinstance(s, unicode): s = unicode(s)
|
elif not isinstance(s, unicode): s = unicode(s)
|
||||||
|
if usage == 'extractedText':
|
||||||
|
# Replace single quotes with blanks.
|
||||||
|
s = s.replace("'", " ").replace(u'’', ' ')
|
||||||
# Remove any special char like accents.
|
# Remove any special char like accents.
|
||||||
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
||||||
# Remove any other char, depending on p_usage.
|
# Remove any other char, depending on p_usage.
|
||||||
|
|
|
@ -19,8 +19,7 @@
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
import xml.sax, difflib, types, cgi
|
import xml.sax, difflib, types, cgi
|
||||||
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
|
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges
|
||||||
property_interning_dict
|
|
||||||
from xml.sax.xmlreader import InputSource
|
from xml.sax.xmlreader import InputSource
|
||||||
from xml.sax import SAXParseException
|
from xml.sax import SAXParseException
|
||||||
from appy.shared import UnicodeBuffer, xmlPrologue
|
from appy.shared import UnicodeBuffer, xmlPrologue
|
||||||
|
@ -170,6 +169,13 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
# Put a question mark instead of raising an exception.
|
# Put a question mark instead of raising an exception.
|
||||||
self.characters('?')
|
self.characters('?')
|
||||||
|
|
||||||
|
# ErrorHandler methods ---------------------------------------------------
|
||||||
|
# Define methods below in your subclass if you want error handling that
|
||||||
|
# does not raise exceptions, but produces a partial result instead.
|
||||||
|
#def error(self, error): pass
|
||||||
|
#def fatalError(self, error): pass
|
||||||
|
#def warning(self, error): pass
|
||||||
|
|
||||||
def parse(self, xml, source='string'):
|
def parse(self, xml, source='string'):
|
||||||
'''Parses a XML stream.
|
'''Parses a XML stream.
|
||||||
* If p_source is "string", p_xml must be a string containing
|
* If p_source is "string", p_xml must be a string containing
|
||||||
|
|
Loading…
Reference in a new issue